%{
#include <stdio.h>
/* Formal specification of OpenFormula syntax, in bison/yacc form.
by David A. Wheeler.

Released under the "MIT license":

Copyright (c) 2005-2006 David A. Wheeler

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

Notes:
+ Doesn't include namespace processing, which isn't really part of this.

Old Notes - check if true:
+ See OpenOffice specification 22 March 2004, pg 171, section 8.1.3.
+ Note that the formula syntax used for exchange is NOT necessarily
  the format displayed to users. E.G., users might see "A1" displayed,
  meaning cell A1, but that'd be stored and exchanged as "[.A1]".
  This is perhaps unfortunate, since this means that the exchange format
  is always different than what is shown the user
  (beyond locale differences)
+ Ordinary precedence and parentheses are allowed; 3+2*5 is 13.
+ You can label cells or cell ranges with IDENTIFIERs, and then
  refer to them in the spreadsheet; simply use the IDENTIFIER.
+ Labels may refer to ranges of cells.
Currently can't have label:label; that seems like an unnecessary weakness.
+ Function calls always include the parentheses, and can be given
  zero or more parameters. If there are parameters, none can be empty.
  RAND() is legal, and so is SUM(.A1;2;3).
  RAND is not a legal function call (need the parens).
+ Note: Parameter separators are semicolons, not commas.
  This is rather inconsistent with the rest of the world, because
  commas are much more common separators for parameters.
  E.G., C, C++, C#, Java, Ada, Fortran.
  I presume that the UI uses semicolon so that users with locales that
  use "," as the decimal point won't get confused. This isn't
  strictly necessary here, since numbers are C locale, but there IS
  an advantage to having the internal syntax being similar to what's
  actually stored and received.
  But shouldn't the READERS at least understand "," since it's a
  likely mistake?
+ Function call names appear to be transformed to all upper case
  before being transferred, but readers should probably accept either
  case and NOT treat case as significant (so "sum" and "SUM" are the same).
  At least, I _think_ that's true - need to confirm.
+ The logical operators are IF(condition;if-true;if-false),
  AND(...), OR(...), NOT(x), TRUE(), and FALSE().
+ OOo uses stronger typing than other spreadsheets, e.g.,
  OR(1,2) isn't TRUE, but an error. Is that really a good idea, since
  it will probably impede interoperability and user expectations?
+ This grammar treats logical operators and function calls as identical,
  since they have the same syntax. However, I believe the
  logical operators IF(), AND(), and OR() must short-circuit, e.g.,
  not do excess computational work. As long as there are no
  functions with side-effects it doesn't matter, but some
  user-defined functions in some languages COULD have side-effects.
An implementation must operate as though it short-circuits, though
  it may actually compute in parallel as long as only side-effect-free
  operations are performed (it may not be possible to ensure that
  some user-defined functions are side-effect free).
+ Unary minus and plus are allowed, so "-[.B1]" is legal.
  Here they're handled by the grammar, not the lexer.
+ Comparison operations (x=y, etc.) return TRUE() or FALSE().
+ The operators +,-,*,/,^ convert booleans on either side to a number
  (false->0, true->1). So (3>2)*5 is 5.
*/

/* The scanner is not part of this file; declare it so the generated
   parser does not rely on an implicit function declaration. */
int yylex(void);
/* Trace hook: called from the rule actions with a description of the
   construct that was just recognized. */
void process(char *s);
/* Error hook called by the generated parser. */
void yyerror(char *s);
/* Non-zero turns on the parser's debug trace (when built with YYDEBUG). */
int yydebug=1;
%}

%token NUMBER STRING IDENTIFIER
%token SINGLEQUOTED DOLLARDOLLAR BANGBANG

/* Define precedence and association direction.
   Lines are listed from lowest precedence to highest. */
%left GE LE EQ NE '>' '<' '='
%left '&'
%left '+' '-'
%left '*' '/'
%left '^'
/* Postfix percent: binds tighter than '^' but looser than the prefix
   sign operators. Without an explicit level every "expr op expr . '%'"
   state is a shift/reduce conflict that bison resolves by shifting,
   which would parse "-5%" as -(5%) and "a:b%" as a:(b%). */
%left '%'
%nonassoc UMINUS
%left '~'
%left '!'
%left ':'

%%

start: formula {process("Accepted");}
     ;

formula: '=' expr
       ;

// Expression ::= Number |
//                String |
//                Array |
//                PrefixOp Expression |
//                Expression PostfixOp |
//                Expression InfixOp Expression |
//                '(' Expression ')' |
//                FunctionName '(' ParameterList ')' |
//                Reference |
//                AutomaticIntersection |
//                NamedExpression |
//                Error
// AutomaticIntersection ::= QuotedLabel '!!' QuotedLabel
// QuotedLabel ::= "'" ([^'] | "''")* "'"

// Expand the operations in place here, it simplifies things considerably.
// Every alternative reports what it recognized through process().
expr: NUMBER                              {process("Constant number");}
    | STRING                              {process("Constant string");}
    | array                               {process("Inline array done");}
    | expr ':' expr                       {process("Range");}
    | expr '!' expr                       {process("Intersection");}
    | expr '~' expr                       {process("Reference union");}
    | '-' expr %prec UMINUS               {process("Unary minus");}
    | '+' expr %prec UMINUS               {process("Unary plus");}
    | expr '%'                            {process("Percent");}
    | expr '^' expr                       {process("Power");}
    | expr '*' expr                       {process("Multiply");}
    | expr '/' expr                       {process("Divide");}
    | expr '+' expr                       {process("add");}
    | expr '-' expr                       {process("Subtract");}
    | expr '&' expr                       {process("String concatenation");}
    | expr '<' expr                       {process("Less-than?");}
    | expr '>' expr                       {process("Greater-than?");}
    | expr GE expr                        {process("Greater-than-or-equal-to?");}
    | expr LE expr                        {process("Less-than-or-equal-to?");}
    | expr NE expr                        {process("Not-equal?");}
    | expr EQ expr                        {process("Equal?");}
    | '(' expr ')'                        {}
    | IDENTIFIER '(' parameter_list ')'   {process("Function call");}
    | reference                           {process("Cell address(es)");}
    | namedExpression                     {process("Named expression");}
    | inlineError                         {process("Error.");}
    | SINGLEQUOTED BANGBANG SINGLEQUOTED  {process("Automatic intersection.");}
    ;

// ParameterList ::= {empty} |
//                   Parameter (Separator EmptyOrParameter )* |
//                   Separator EmptyOrParameter {First param empty}
//                   (Separator EmptyOrParameter )*
parameter_list: /* empty */
    | parameter rest_of_list
    | ';' empty_or_parameter rest_of_list /* Handle first-param-empty */
          {process("First param was empty");}
    ;

parameter: expr {process("Function parameter");}
    ;

rest_of_list: /* empty */
    | ';' empty_or_parameter rest_of_list
    ;

/* empty_or_parameter can be empty, as can the start of rest_of_list.
   That's not a problem for the definition of rest_of_list, though.
   at the beginning of rest_of_list, the only legal nonwhitespace is
   ';' (keeping us in rest_of_list) or ')' (which ends the
   function parameter list). Thus it's not ambiguous. */
empty_or_parameter: /* empty */ {process("Empty param");}
    | parameter
    ;

// Reference ::= '[' Source? RangeAddress ']'
reference: '[' optionalSource rangeAddress ']'
    ;

// Spec says:
//
// RangeAddress ::=
//   SheetLocator "."
//   Column Row (':' SheetLocator "." Column Row )? |
//   SheetLocator "." Column ':' SheetLocator "." Column |
//   SheetLocator "." Row ':' SheetLocator "." Row
// Column ::= '$'? [A-Z]+
// Row ::= '$'? [1-9] [0-9]*
//
// Easier to implement as:
// RangeAddress ::= Locator (':' Locator)?
// Locator ::= SheetName ('.' Component)+
// Component ::= '$'? Column ('$' Row)? | '$' Row |
//               QuotedSheetName
// Column ::= '$'? [A-Z]+
// Row ::= '$'? [1-9] [0-9]*
//
// It's easier to accept components (even if illegal at first) and go back
// to check them, because finding the last component is easy -
// it's the last one we read in!
//
// This syntax permits references to whole-sheets (as well as
// whole-row and whole-column) - should that be accepted?
// This syntax accepts references to a whole row/column WITHOUT requiring a
// range marker; it can do that unambiguously because we know this cannot
// be a named range. Still, I think the EXCHANGE format should require
// the marker, because most UI's will require it AND it makes the
// point clearer.

/* One locator, optionally followed by ':' and a second locator. */
rangeAddress
    : locator optRange
    ;

optRange
    : /* empty */
    | ':' locator
    ;

/* A (possibly empty) sheet name followed by one or more '.'-prefixed
   components. */
locator
    : sheetName componentlist
    ;

/* componentlist is "one or more"; moreComponentlist is the
   "zero or more" prefix in front of the final component. */
componentlist
    : moreComponentlist '.' component
    ;

moreComponentlist
    : /* empty */
    | moreComponentlist '.' component
    ;

component
    : optdollar IDENTIFIER
        /* Lie - really only Column or Column no-$ Row is okay. */
    | optdollar IDENTIFIER '$' NUMBER
        /* Lie - really only Column $ Row is okay here. */
    | optdollar NUMBER
        /* Lie - really only Row is okay here. */
    | quotedSheetName
    ;

sheetName
    : /* empty */
    | quotedSheetName
    | optdollar IDENTIFIER
    ;
/* TODO: Ok, that's cheating, SheetName allows more,
   and it's a pain to handle it here, because the dollar is
   optional (so we can't depend on a particular char
   preceding this construct to help us easily switch modes).
   Maybe we should require IDENTIFIER here, and anything
   else has to be quoted? */
/* TODO: IDENTIFIER accepts ".", unquoted sheet name can't.
*/

/* NOTE(review): `error` below is bison's built-in error-recovery token,
   while the spec text quoted further down lists an `Error` production
   for QuotedSheetName - confirm which of the two was intended. */
quotedSheetName
    : optdollar SINGLEQUOTED
    | error
    ;

optdollar
    : /* empty */
    | '$'
    ;

// The original text is this - the problem is that we can't know
// without multi-token lookahead if a given component is the last one.
// E.G., when I see "S1.A1.", I can't know if the next component is the
// last one or not. So it's easier to read in all the components, and
// then obviously the last component read in is the last component.
// Reference ::= '[' Source? RangeAddress ']'
// RangeAddress ::=
//   SheetLocator "." Column Row (':' SheetLocator "." Column Row )? |
//   SheetLocator "." Column ':' SheetLocator "." Column |
//   SheetLocator "." Row ':' SheetLocator "." Row
// Column ::= '$'? [A-Z]+
// Row ::= '$'? [1-9] [0-9]*
// rangeAddress: sheetLocator '.' postRangeAddress ;
//
//
// postRangeAddress: optdollar IDENTIFIER optdollarnumber secondHalf
//                   /* IDENTIFIER is a lie - it's really Column Row?,
//                      but we'll let postprocessing check that. */
//                 | optdollar NUMBER ':' optdollar NUMBER ;
//                   /* Here, NUMBER must be integer [0-9]+ and nonzero */
//
// secondHalf: /* empty */
//           | ':' sheetLocator '.' IDENTIFIER ;
//             /* IDENTIFIER is a lie, only Column Row? okay, and
//                whether or not there's a row depends on the predecessor.
//                We'll let postprocessing check all that. */
//
// optdollarnumber: /* empty */
//                | '$' NUMBER ;
//                  /* Here, NUMBER must be integer [0-9]+ and nonzero */
// SheetLocator ::= SheetName ("." SubtableCell)*
// SheetName ::= QuotedSheetName | '$'? [^\. #$']+ | /*empty */
// QuotedSheetName ::= '$'? "'" ([^'] | "''")+ "'" | Error
// SubtableCell ::= ( Column Row ) | QuotedSheetName
// sheetLocator: sheetName subTable
//
// sheetLocator: IDENTIFIER ;
//
// Source ::= "'" IRI "'" "#"
optionalSource
    : /*empty*/
    | SINGLEQUOTED '#'
    ;
// source: SINGLEQUOTED '#' ;

// ORIGINAL:
// NamedExpression ::= Source?
//                     ( QuotedSheetName ("." SubtableCell)* "." )?
//                     ExpIdentifier
// ExpIdentifier ::= "$$"?
//                   Identifier | "$$'" ([^']|'')* "'"
//
// Problem: In parsing S1.A1.A1, the first one is a column/row
// and the second is the name, but we can't easily tell that,
// Locator ::= QuotedSheetName ('.' Component)+
// Component ::= '$'? Column ('$' Row)? | '$' Row |
//               QuotedSheetName
// Column ::= '$'? [A-Z]+
// Row ::= '$'? [1-9] [0-9]*
//
// (Similar problem - can't know if the "SubtableCell" we see is the last
// one or not until we've already parsed it.)
// NamedExpression ::= Source?
//                     ( QuotedSheetName ("." SubtableCell)* "." )?
//                     ExpIdentifier
// ExpIdentifier ::= "$$"? Identifier | "$$'" ([^']|'')* "'"
// namedExpression: optionalSource optionalQuotedLocator expIdentifier ;
//
// expIdentifier: DOLLARDOLLAR IDENTIFIER |
//                DOLLARDOLLAR SINGLEQUOTED |
//                IDENTIFIER ;
//
// optionalQuotedLocator: /* empty */ |
//                quotedSheetName subTable '.' ;
//
// subTable: /* empty */ |
//           '.' subTableCell subTable ;
//
// subTableCell: IDENTIFIER |
//               /* A lie - really only Column Row is okay here. */
//               quotedSheetName ;

// Try1: need to REQUIRE that if there's a source,
// there's a sheetname. Reasonable enough.
// NamedExpression ::= Source ( MarkedIdentifier | QuotedSheetName
//    'Source'#
//    $ Sheetname .
//    'SheetName' .
//    ('.' (Cell | QuotedSheetName | QuotedNamedExpr))+
//    $$'NamedExpr'
//    $$NamedExpr
//    'Source'#$SheetName.A1
//    'Source'#SheetName.A1
//    'Source'#NamedExpr
//    'Source'#$SheetName.$$NamedExpr
//    'Source'#SheetName.$$'NamedExpr'
//    ( Source QuotedSheetName ("." SubtableCell)* "." )?
//    ExpIdentifier
// ExpIdentifier ::= "$$"? Identifier | "$$'" ([^']|'')* "'"
// Again, hard to find the end. Changed to:
// NamedExpression ::= Source? ( '.' ExprComponent)+
// ExprComponent ::= QuotedSheetName | NoDotIdentifier
//                   (might also be row/col!)
//                   '$' Column '$'? Row |  // CONFLICT - Column Row with NoDotIdentifier
//                   "$$" ( NoDotIdentifier | singlequoted )
//
// namedExpression: optionalSource namedExprComponentList ;
//
// expIdentifier: DOLLARDOLLAR IDENTIFIER |
//                DOLLARDOLLAR SINGLEQUOTED |
//                IDENTIFIER ;
//
// optionalQuotedLocator: /* empty */ |
//                quotedSheetName subTable '.' ;
//
// subTable: /* empty */ |
//           '.' subTableCell subTable ;
//
// subTableCell: IDENTIFIER |
//               /* A lie - really only Column Row is okay here. */
//               quotedSheetName ;
//
//
// namedExprComponentlist: moreNamedExprComponentlist '.' component;
//
// moreComponentlist: /* empty */ | moreNamedExprComponentlist '.' component;
//
// component: optdollar IDENTIFIER |
//            /* Lie - really only Column or Column no-$ Row is okay. */
//            optdollar IDENTIFIER '$' NUMBER |
//            /* Lie - really only Column $ Row is okay here. */
//            optdollar NUMBER |
//            /* Lie - really only Row is okay here. */
//            quotedSheetName ;
//
//
//
// sheetName: /* empty */ |
//            quotedSheetName |
//            optdollar IDENTIFIER ;
//            /* TODO: Ok, that's cheating, SheetName allows more,
//               and it's a pain to handle it here, because the dollar is
//               optional (so we can't depend on a particular char
//               preceding this construct to help us easily switch modes).
//               Maybe we should require IDENTIFIER here, and anything
//               else has to be quoted? */
//            /* TODO: IDENTIFIER accepts ".", unquoted sheet name can't. */
//
// quotedSheetName: optdollar SINGLEQUOTED |
//                  error ;
//

// Try2:
// Realize that we're trying too hard. Presume that a named
// expression is always attached to the entire workbook, or to a sheet,
// not to a cell or subcell. Also presume that a workbook contains
// sheets, but only one level - don't try to handle sheets in cells or
// sheets in sheets. And it's okay to FORCE use of a marker for
// an external namedexpression.
// Note that '$$' is optional if no source or sheet given.
// We'll REQUIRE the '$$' if the source or sheet is given,
// because "Source Identifier" looks very ambiguous
// (is Identifier a sheetname or Identifier?), and we want it crystal-clear.
// NamedExpression ::= Identifier | NamedExprLocator?
//                     '$$' (Identifier | SINGLEQUOTED)
// ExpLocator ::= (Source '#')? (QuotedSheetName '.')?
// It _looks_ like you'd be able to do this in bison/yacc:
// namedExpression: IDENTIFIER | optionalSource optionalQuotedSheetName
//                  DOLLARDOLLAR identifierOrSingleQuoted ;
// optionalQuotedSheetName: /* empty */ | quotedSheetName '.' ;
// But this doesn't directly work in bison, because since optionalSource and
// optionalQuotedSheetName both begin with SINGLEQUOTED, bison won't
// know what rule to descend into.
//
// Another way to implement this is:
// NamedExpression ::= Identifier |
//       (SINGLEQUOTED ('.' /* Previous was sheetname*/ |
//                      '#' /* Prev was URL */
//                      (QuotedSheetName '.')?))
//       '$$' (Identifier | SINGLEQUOTED)
//

/* Either a bare identifier, or an optional locator followed by the
   mandatory '$$' marker and the expression's name. */
namedExpression
    : IDENTIFIER
    | namedExprLocator DOLLARDOLLAR identifierOrSingleQuoted
    ;

/* The token after the leading SINGLEQUOTED decides what it was:
   '.' means it was a sheet name, '#' means it was an external source. */
namedExprLocator
    : /*empty*/
    | SINGLEQUOTED '.'                         {process("SheetName");}
    | SINGLEQUOTED '#' optionalQuotedSheetName {process("External");}
    ;

optionalQuotedSheetName
    : /* empty */
    | quotedSheetName '.'                      {process("SheetName");}
    ;

identifierOrSingleQuoted
    : IDENTIFIER
    | SINGLEQUOTED
    ;

//
// expIdentifier: DOLLARDOLLAR IDENTIFIER |
//                DOLLARDOLLAR SINGLEQUOTED |
//                IDENTIFIER ;
//
// optionalQuotedLocator: /* empty */ |
//                quotedSheetName subTable '.' ;
//
// subTable: /* empty */ |
//           '.' subTableCell subTable ;
//
// subTableCell: IDENTIFIER |
//               /* A lie - really only Column Row is okay here. */
//               quotedSheetName ;
//
//
// namedExprComponentlist: moreNamedExprComponentlist '.' component;
//
// moreComponentlist: /* empty */ | moreNamedExprComponentlist '.' component;
//
// component: optdollar IDENTIFIER |
//            /* Lie - really only Column or Column no-$ Row is okay.
//            */
//            optdollar IDENTIFIER '$' NUMBER |
//            /* Lie - really only Column $ Row is okay here. */
//            optdollar NUMBER |
//            /* Lie - really only Row is okay here. */
//            quotedSheetName ;
//
//
//
// sheetName: /* empty */ |
//            quotedSheetName |
//            optdollar IDENTIFIER ;
//            /* TODO: Ok, that's cheating, SheetName allows more,
//               and it's a pain to handle it here, because the dollar is
//               optional (so we can't depend on a particular char
//               preceding this construct to help us easily switch modes).
//               Maybe we should require IDENTIFIER here, and anything
//               else has to be quoted? */
//            /* TODO: IDENTIFIER accepts ".", unquoted sheet name can't. */
//
// quotedSheetName: optdollar SINGLEQUOTED |
//                  error ;
//

// TODO /* ERRORNAME - placeholder */
// Error ::= '#' [A-Z0-9]+ ([!?] | ('/' ([A-Z] | ([0-9] [!?]))))
// Can't call this rule "error", that term has a special meaning in bison!
inlineError: '#' {/* Switch to error processing */}
             IDENTIFIER /* ERRORNAME */ {/*switch back*/}
    ;

/* TODO: Fix for new syntax what's below! */
// Array ::= '{' Matrix ( '_' Matrix )* '}'
// Matrix ::= ( MatrixRow ( RowSeparator MatrixRow )* ) ?
// MatrixRow ::= Expression ( ';' Expression )*
// RowSeparator ::= '|'
// NOTE: This drops 3D, per discussion, so it's really just
// Array ::= '{' Row (RowSeparator Row)* '}'
// Row ::= Expression ( ';' Expression )*
// RowSeparator ::= '|'
array: '{' row nextRow '}'
    ;

nextRow: /* empty */
    | '|' row nextRow
    ;

row: expr nextExpr
    ;

nextExpr: /* empty */
    | ';' expr nextExpr
    ;

%%

/* Trace hook used by the rule actions: prints the description of the
   construct that was just recognized, one per line, on stdout. */
void process(char *s) {
  fprintf(stdout, "%s\n", s);
}

/* Error hook called by the generated parser. The message goes to stderr
   so diagnostics do not get mixed into the trace that process() writes
   to stdout. */
void yyerror(char *s) {
  fprintf(stderr, "%s\n", s);
}

/* Runs the parser once. The exit status is 0 when yyparse() reports
   success and 1 otherwise, so callers and scripts can tell whether the
   input was accepted. */
int main(void) {
  return yyparse() == 0 ? 0 : 1;
}