#include <stdio.h>
#include <string.h>
#ifndef GLOBAL_DECL
#include "global.h" // Global data types and variables
#endif
// Constructor.
// caller - stored in `parent`.
// setts  - stored in `settings`; its `supervisor` and `show_extern` values are
//          also copied into members of the same name.
parser::parser(getter* caller, config* setts)
{
    parent   = caller;
    settings = setts;

    // Local copies of the two configuration values used while parsing
    supervisor  = setts -> supervisor;
    show_extern = setts -> show_extern;
}
// ******** Lexical Analyser *********
// Lexical analyser: copies the next lexeme from the circular read-ahead
// buffer into `result`.
//
// result - caller-supplied buffer that receives the NUL-terminated lexeme.
//          At most BUFFER_SIZE-1 characters plus the terminator are stored,
//          so it must hold BUFFER_SIZE chars.
// Returns false when the input is exhausted (the character at `head` is EOF
// or NUL, either immediately or after skipping leading blanks), true when a
// lexeme was stored.
//
// Leading spaces and newlines are skipped. A delimiter (space, newline, '<',
// '>', '=', '"', '&', ';', NUL, EOF) found at the start is returned as a
// one-character lexeme; otherwise everything up to, but not including, the
// next delimiter is returned. If no delimiter exists anywhere in the buffer,
// the first BUFFER_SIZE-1 characters are returned as one lexeme.
bool parser::get_lexeme(char* result)
{
    int scan = 0;   // Look-ahead index used to find the end of the lexeme
    int out  = 0;   // Next free position in result

    // On the first call fill the whole buffer from the source
    if (!buffer_init)
    {
        for (scan = 0; scan < BUFFER_SIZE; scan++)
            buffer[scan] = source -> read();
        buffer_init = true;
        head = 0;
    }

    // End of file reached and the buffer exhausted
    if ((buffer[head] == EOF) || (buffer[head] == '\0'))
        return false;

    // Extra spaces and newlines before a lexeme are ignored
    while ((buffer[head] == ' ') || (buffer[head] == '\n'))
        read_file();

    // Skipping blanks may have brought the end of input to the head of the
    // buffer (input ending in whitespace). Report EOF now rather than handing
    // back a lexeme that consists of the EOF/NUL marker itself.
    if ((buffer[head] == EOF) || (buffer[head] == '\0'))
        return false;

    // Search forward (with wrap-around) for the end of the lexeme
    scan = head;
    do
    {
        switch ( buffer[scan] )
        {
            case ' ':
            case '>':
            case '<':
            case '=':
            case '\"':
            case '\n':
            case '&':
            case ';':
            case '\0':
            case EOF:
                if (scan == head)
                {
                    // The delimiter itself is the lexeme
                    result[0] = buffer[scan];
                    result[1] = '\0';
                    read_file();
                    return true;
                }

                // End of lexeme found: copy head..scan-1, refilling each
                // consumed slot from the source as we go
                out = 0;
                do
                {
                    result[out++] = buffer[head];
                    read_file();
                } while (head != scan);
                result[out] = '\0';
                return true;
        }
        scan++;
        if (scan == BUFFER_SIZE) scan -= BUFFER_SIZE;
    } while ( scan != head );

    // No delimiter anywhere in the buffer: the lexeme is longer than the
    // buffer, so return as much as fits in one lexeme
    out = 0;
    do
    {
        result[out++] = buffer[head];
        read_file();
    } while (out < BUFFER_SIZE-1);
    result[out] = '\0';
    return true;
}
void parser::get_trigger()
{
block_opts* current;
current = pg_block_opts -> get_item();
if (current != NULL)
strcpy(trigger, current -> trigger_text);
else
trigger[0] = NULL;
}
void parser::setup_blocks()
{
block_opts* current;
trigger[0] = NULL;
pg_block_opts -> move_first();
current = pg_block_opts -> get_item();
if (current != NULL)
{
strcpy(trigger, current -> trigger_text);
if (current -> start_ignored)
ignore_block();
}
}
void parser::read_file()
{
int loop, loop2 = 0;
bool found = true;
// Read a new character into the buffer
buffer[head++] = source -> read();
if (head==BUFFER_SIZE) head-=BUFFER_SIZE;
// Search for start or end of ignored section
// This is a hideously high overhead in terms of processing time
if (trigger[0] != NULL)
{
loop = head;
do
{
if (trigger[loop2++] != buffer[loop++])
found = false;
if (loop==BUFFER_SIZE) loop-=BUFFER_SIZE;
} while ((loop2 < (signed)strlen(trigger)) && (found));
if (found)
{
// Toggle between un/ignored
if (ignored_block)
unignore_block();
else
ignore_block();
// Get next trigger
get_trigger();
}
}
}
// Reads a single raw character, bypassing lexeme splitting.
// result - receives the character at the head of the buffer.
// Returns false (leaving *result untouched) when that character is EOF or
// NUL; otherwise stores it, consumes it via read_file() and returns true.
bool parser::get_character(char* result)
{
    const bool exhausted = (buffer[head] == EOF) || (buffer[head] == '\0');
    if (exhausted)
        return false;

    *result = buffer[head];
    read_file();
    return true;
}
// **************** Parser ******************
// Looks up how table number `index` should be treated on this page.
// Walks the page's table-option list from the start and returns the
// `ignore_table` value of the first entry whose `table_no` equals `index`,
// or whose range (`ignore_range` set, table_no..table_range_end inclusive)
// contains it. Returns Display when there is no list or no entry matches.
ignore_type parser::get_table_opts(unsigned int index)
{
    if (pg_table_opts == NULL)
        return Display;

    pg_table_opts -> move_first();
    for (table_opts* entry = pg_table_opts -> get_item();
         entry != NULL;
         entry = pg_table_opts -> get_item())
    {
        // Exact table number
        if (entry -> table_no == index)
            return entry -> ignore_table;

        // Table number inside an inclusive range
        if ( (entry -> ignore_range)
          && (index >= entry -> table_no)
          && (index <= entry -> table_range_end) )
            return entry -> ignore_table;
    }
    return Display;
}
// Marks the start of an ignored section.
// Supervisor mode: the text stays in the output but is switched to a small
// italic font. Otherwise the destination's `ignored_section` flag is set.
void parser::ignore_section()
{
    ignored_section = true;

    if (!supervisor)
    {
        destination -> ignored_section = true;
        return;
    }
    destination -> write("</FONT><FONT SIZE=\"2\"><I>");
}
// Marks the end of an ignored section.
// Supervisor mode: closes the small italic font and reopens a font of size
// `font_size`. Otherwise the destination's `ignored_section` flag is cleared.
void parser::unignore_section()
{
    ignored_section = false;

    if (!supervisor)
    {
        destination -> ignored_section = false;
        return;
    }
    destination -> write("</FONT></I><FONT SIZE=\"");
    destination -> write ( font_size );
    destination -> write("\">");
}
// Marks the start of an ignored block.
// Supervisor mode: announces the block and switches to a small italic font.
// Otherwise the destination's `ignored_section` flag is set (the same flag
// that ignore_section() sets).
void parser::ignore_block()
{
    ignored_block = true;

    if (!supervisor)
    {
        destination -> ignored_section = true;
        return;
    }
    destination -> write("<BR><B>Start of Ignored Block</B><BR>");
    destination -> write("</FONT><FONT SIZE=\"2\"><I>");
}
// Marks the end of an ignored block.
// Supervisor mode: restores a font of size `font_size` and announces the end
// of the block. Otherwise writes a line break and clears the destination's
// `ignored_section` flag.
void parser::unignore_block()
{
    ignored_block = false;

    if (!supervisor)
    {
        // NOTE(review): the <BR> is written while the destination flag is
        // still set - confirm whether fileaccess::write emits it in that state.
        destination -> write("<BR>");
        destination -> ignored_section = false;
        return;
    }
    destination -> write("</FONT></I><FONT SIZE=\"");
    destination -> write ( font_size );
    destination -> write("\"><BR><B>End of Ignored Block</B><BR>");
}
// Parses one page: reads `fname_in`, writes the converted page to `fname_out`.
//
// fname_in, fname_out - input and output file names.
// url, description    - passed to the destination's writeheaders(); url is
//                       also passed to writefooters().
// pg_table_details    - per-page table options (may be NULL, see get_table_opts()).
// subpage             - stored in `is_subpage`.
// size                - font size text, copied into `font_size`.
// pg_block_details    - per-page block options used by setup_blocks().
//
// Returns false if either file reports an error after opening, true once the
// page has been processed and both files have been released.
bool parser::parse( char* fname_in, char* fname_out, char* url, char* description,
                    tableoptslist* pg_table_details, bool subpage, char* size,
                    blockoptslist* pg_block_details)
{
    // Open files
    source      = new fileaccess( fname_in, "r", settings );
    destination = new fileaccess( fname_out, "w", settings );
    if ( (source -> error) || (destination -> error) )
    {
        // Release both objects instead of leaking them (and whichever file
        // did open). NOTE(review): assumes the fileaccess destructor copes
        // with a failed open - confirm.
        delete source;
        delete destination;
        source      = NULL;
        destination = NULL;
        return false;
    }

    // Set initial values for variables
    is_subpage      = subpage;
    buffer_init     = false;
    in_link         = false;
    frame_index     = 1;
    table_number    = 1;
    table_ignoring  = -1;
    ignored_section = false;
    ignored_block   = false;
    stack_index     = 0;
    link_no         = 1;
    pg_table_opts   = pg_table_details;
    pg_block_opts   = pg_block_details;
    // NOTE(review): unbounded copy - assumes `size` fits in font_size; confirm.
    strcpy(font_size, size);

    // Write HTML headers
    destination -> writeheaders(url, description);

    // Setup the block ignore policies
    setup_blocks();

    // Transfer control to first state; it only returns at end of input
    ST_text();

    // Make sure you do not ignore the footer!
    unignore_block();

    // Write HTML footers
    destination -> writefooters(url);

    // Close files
    delete source;
    delete destination;
    source      = NULL;
    destination = NULL;
    return true;
}
// States:
bool parser::ST_text() // Initial state - Main text of page
{
char current_lexeme[BUFFER_SIZE];
do
{
// Call to lexical analyser
if (!get_lexeme(current_lexeme))
return false; // EOF
// Special character Non-Terminal symbol
if (current_lexeme[0] == '&') { if (!ST_special()) return false; }
// Tag Non-Terminal symbol
else if (current_lexeme[0] == '<') { if (!ST_utag()) return false; }
// Normal text
else
{
// Output character
destination -> write ( current_lexeme );
destination -> write ( ' ' );
}
} while (true);
return true;
}
bool parser::ST_utag() // Unknown Tag state
{
char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
if (!get_lexeme(current_lexeme))
return false; // EOF
if ( !stricmp(current_lexeme, "TITLE") ) { if(!ST_title()) return false;}
if ( !stricmp(current_lexeme, "BR") ) { if(!ST_break()) return false;}
if ( !stricmp(current_lexeme, "!--") ) { if(!ST_comment()) return false;}
if ( !stricmp(current_lexeme, "SCRIPT") ) { if(!ST_script()) return false;}
if ( !stricmp(current_lexeme, "STYLE") ) { if(!ST_style()) return false;}
if ( !stricmp(current_lexeme, "CAPTION") ) { if(!ST_caption()) return false;}
if ( !stricmp(current_lexeme, "TABLE") ) { if(!ST_table()) return false;}
if ( !stricmp(current_lexeme, "/TABLE") ) { if(!ST_tableend()) return false;}
if ( !stricmp(current_lexeme, "TR") ) { if(!ST_tablerow()) return false;}
if ( !stricmp(current_lexeme, "TD") ) { if(!ST_tablecol()) return false;}
if ( !stricmp(current_lexeme, "TH") ) { if(!ST_tablecol()) return false;}
if ( !stricmp(current_lexeme, "FRAME") ) { if(!ST_frame()) return false;}
if ( !stricmp(current_lexeme, "NOFRAMES") ){ if(!ST_noframes()) return false;}
if ( !stricmp(current_lexeme, "FRAMESET") ){ if(!ST_frameset()) return false;}
if ( !stricmp(current_lexeme, "/A") ) { if(!ST_linkend()) return false;}
if ( !stricmp(current_lexeme, "A") )
{ if(!ST_link()) return false; else return true; }
if ( !stricmp(current_lexeme, "IMG") )
{ if(!ST_image()) return false; else return true; }
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
} while (current_lexeme[0] != '>');
return true;
}
bool parser::ST_special() // Special character
{
char current_lexeme[BUFFER_SIZE]; // Current lexeme
char prev_lexeme[BUFFER_SIZE]; // Previous lexeme
int loop;
if (!get_lexeme(prev_lexeme))
return false; // EOF
if (prev_lexeme[0] == '<')
{
destination -> write("and ");
if (!ST_utag())
return false;
else
return true;
}
if (!get_lexeme(current_lexeme))
return false; // EOF
if (current_lexeme[0] == '<')
{
destination -> write("and ");
destination -> write(prev_lexeme);
destination -> write(' ');
if (!ST_utag())
return false;
else
return true;
}
else if ((current_lexeme[0] != ';'))// Not a special character after all!
{
destination -> write("and ");
destination -> write(prev_lexeme);
destination -> write(' ');
destination -> write(current_lexeme);
destination -> write(' ');
return true;
}
loop = 0;
do
{
// Check through the list of special characters to see if it exists
if ( (!strcmp(prev_lexeme, SpecialChars[loop].NE) )
|| ( (prev_lexeme[0] == '#') && (atoi(&prev_lexeme[1]) == SpecialChars[loop].code) )
)
{
if ( SpecialChars[loop].text != NULL )
{
destination -> write( "<B>" );
destination -> write( SpecialChars[loop].text );
destination -> write( "</B>" );
destination -> write(' ');
}
else
{
destination -> write( (char)SpecialChars[loop].code );
destination -> write(' ');
}
}
loop++;
} while (SpecialChars[loop].NE != NULL);
return true;
}
// Line break state: writes a <BR> to the output. Always returns true.
bool parser::ST_break()
{
    destination->write("<BR>");

    return true;
}
// End-of-link state: clears `in_link` and closes the anchor in the output.
// Always returns true.
bool parser::ST_linkend()
{
    in_link = false;
    destination->write("</A>");

    return true;
}
// Image state: entered after "<IMG" has been read.
// Consumes the tag up to '>'. The text of every ALT attribute (as formatted
// by ST_alt()) is written to the output. When showing externs inside a link
// and the image has no ALT attribute or an empty one, the placeholder
// " <B>Image</B> " is written instead.
// Returns false when the input ends, true otherwise.
bool parser::ST_image()
{
    char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
    bool has_alt = false;
    char alt_text[MAXLEN_ALT];

    // Start with an empty string so the buffer is never read uninitialised
    alt_text[0] = '\0';

    do
    {
        if (!get_lexeme(current_lexeme))
            return false; // EOF
        if ( !stricmp(current_lexeme, "ALT") )
        {
            has_alt = true;
            // ST_alt() can return true without storing anything (ALT not
            // followed by '='), so reset the text before every call.
            alt_text[0] = '\0';
            if (!ST_alt(alt_text))
                return false;
            destination -> write(alt_text);
        }
    } while (current_lexeme[0] != '>');

    // If in link, showing externs, and no usable ALT text, then insert "Image"
    if ((show_extern) && (in_link) && ((!has_alt) || (alt_text[0] == '\0')))
        destination -> write(" <B>Image</B> ");
    return true;
}
// Alternative text state: entered after an ALT attribute name has been read.
//
// text - output buffer of at least MAXLEN_ALT chars. On a true return it
//        holds either an empty string (no '=', or an empty quoted value) or
//        the alt text wrapped in bold quote marks:  <B>"</B>...<B>"</B>
// Returns false when the input ends, true otherwise.
//
// The text before the closing decoration is limited to MAXLEN_ALT-19
// characters. Anything beyond that is dropped, but the lexemes are still
// consumed up to the closing quote.
bool parser::ST_alt(char* text)
{
    char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
    bool first_char = true;           // No alt text has been stored yet
    // Assumes MAXLEN_ALT is comfortably larger than 19.
    const size_t content_limit = MAXLEN_ALT - 19;

    // Guarantee a valid (empty) string on every return path
    text[0] = '\0';

    if (!get_lexeme(current_lexeme))
        return false; // EOF
    if ( current_lexeme[0] != '=' ) // Unexpected symbol found, exit
        return true;
    if (!get_lexeme(current_lexeme))
        return false; // EOF

    if ( current_lexeme[0] != '\"' ) // Alt text has no quotes around it
    {
        strcpy(text, " <B>");
        strcat(text, "\"</B>");
        // strncat copies at most content_limit characters and always
        // terminates, so it covers both the short and the truncated case
        strncat(text, current_lexeme, content_limit);
        strcat(text, "<B>\"</B> ");
        return true;
    }

    // Quoted alt text: collect lexemes until the closing quote
    do
    {
        if (!get_lexeme(current_lexeme))
            return false; // EOF

        if (current_lexeme[0] == '\"')
        {
            if (!first_char)
                strcat(text, "<B>\"</B> ");
            else
                text[0] = '\0'; // Why do people use empty alt tags???
            return true;
        }

        if (first_char)
        {
            strcpy(text, " <B>");
            strcat(text, "\"</B>");
            first_char = false;
        }
        else if (strlen(text) < content_limit)
        {
            // Separate words, but only while there is still room
            strcat(text, " ");
        }

        // Append as much of the lexeme as still fits. The remaining space is
        // only computed when it is positive; an unsigned subtraction that
        // wrapped around would make strncat effectively unbounded.
        const size_t used = strlen(text);
        if (used < content_limit)
            strncat(text, current_lexeme, content_limit - used);
    } while (true);
    return true;
}
bool parser::ST_script() // Script in page - remove completely
{
char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
} while (stricmp(current_lexeme, "/SCRIPT"));
return true;
}
bool parser::ST_style() // Style sheets in page - remove completely
{
char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
} while (stricmp(current_lexeme, "/STYLE"));
return true;
}
// Table start state. Does nothing inside an ignored block.
//
// Records the table number on the nesting stack, then acts on the policy
// returned by get_table_opts():
//   Display  - line break; in supervisor mode also a "Table n Not announced" note.
//   Ignore   - starts an ignored section (unless one is already active) and,
//              in supervisor mode, writes a "Table n Ignored." note.
//   Announce - writes "Table start." (or "Table n Announced" in supervisor mode).
// In supervisor mode the note is prefixed with the enclosing table numbers,
// "( a -> b ) -> ", when the table is nested.
// Finally advances the table counter and nesting depth while the stack has room.
// Always returns true.
bool parser::ST_table()
{
    char blank[12];   // itoa() scratch space: large enough for any 32-bit int
    int loop;

    if (ignored_block)
        return true;

    // Remember for table end. Once the stack is full stack_index stays at
    // STACK_DEPTH, so the store must be skipped to stay inside the array.
    if (stack_index < STACK_DEPTH)
        table_stack[stack_index] = table_number;

    switch (get_table_opts(table_number))
    {
        case Display: // Table should just be printed normally
            destination -> write("<BR>");
            if (supervisor)
            {
                destination -> write("<B>Table ");
                if (stack_index > 0) // Print list of nestings to ease setting up
                {
                    destination -> write("( ");
                    for (loop = 0; loop < stack_index; loop++)
                    {
                        destination -> write(itoa(table_stack[loop], blank, 10));
                        if (loop < stack_index-1)
                            destination -> write(" -> ");
                    }
                    destination -> write(" ) -> ");
                }
                destination -> write(itoa(table_number, blank, 10));
                destination -> write(" Not announced</B>");
                destination -> write("<BR>");
            }
            break;

        case Ignore: // Table should be ignored
            if (!ignored_section)
            {
                table_ignoring = table_number;
                ignore_section();
            }
            if (supervisor)
            {
                destination -> write("<BR>");
                destination -> write("<B>Table ");
                if (stack_index > 0) // Print list of nestings to ease setting up
                {
                    destination -> write("( ");
                    for (loop = 0; loop < stack_index; loop++)
                    {
                        destination -> write(itoa(table_stack[loop], blank, 10));
                        if (loop < stack_index-1)
                            destination -> write(" -> ");
                    }
                    destination -> write(" ) -> ");
                }
                destination -> write(itoa(table_number, blank, 10));
                destination -> write(" Ignored.</B> ");
            }
            else
                destination -> write("<BR>");
            break;

        case Announce: // Table should be announced
            destination -> write("<BR>");
            destination -> write("<B>Table ");
            if (supervisor)
            {
                // Same nesting list as the other two cases: the numbers of
                // the enclosing tables, taken from the stack.
                if (stack_index > 0)
                {
                    destination -> write("( ");
                    for (loop = 0; loop < stack_index; loop++)
                    {
                        destination -> write(itoa(table_stack[loop], blank, 10));
                        if (loop < stack_index-1)
                            destination -> write(" -> ");
                    }
                    destination -> write(" ) -> ");
                }
                destination -> write(itoa(table_number, blank, 10));
                destination -> write(" Announced ");
                destination -> write("</B>"); // Close the <B> opened above
            }
            else
                destination -> write("start.</B>");
            destination -> write("<BR>");
            break;
    }

    if (stack_index < STACK_DEPTH)
    {
        table_number++;
        stack_index++;
    }
    return true;
}
// Table row state: each row starts on a new output line. Always returns true.
bool parser::ST_tablerow()
{
    destination->write("<BR>");

    return true;
}
// Table end state. Does nothing inside an ignored block, or when there are
// more table ends than table starts.
//
// Pops the number of the table being closed from the nesting stack and
// writes the end note chosen by that table's policy (for Ignore and Display
// only in supervisor mode). If this table is the one that started the
// current ignored section, the section ends here.
// Always returns true.
bool parser::ST_tableend()
{
    unsigned int index;
    char blank[12];   // itoa() scratch space: large enough for any 32-bit int

    if (ignored_block)
        return true;

    if (!(stack_index > 0))
        return true; // More table ends than starts!

    index = table_stack[--stack_index];

    switch (get_table_opts(index))
    {
        case Announce:
            destination -> write("<BR>");
            destination -> write("<B>End of table");
            if (supervisor)
            {
                destination -> write(" ");
                destination -> write(itoa(index, blank, 10));
                destination -> write(" Announced ");
            }
            destination -> write("</B>");
            destination -> write("<BR>");
            break;

        case Ignore:
            if (supervisor)
            {
                destination -> write("<BR>");
                destination -> write("<B>End of table ");
                destination -> write(itoa(index, blank, 10));
                destination -> write(" Ignored</B> ");
            }
            break;

        case Display:
            if (supervisor)
            {
                destination -> write("<BR>");
                destination -> write("<B>End of table ");
                destination -> write(itoa(index, blank, 10));
                destination -> write("</B> ");
            }
            break;
    }

    // The table that started the ignored section has ended
    if (index == table_ignoring)
    {
        unignore_section();
    }
    return true;
}
// Table column state (used for TD and TH): separates cells with a space.
// Always returns true.
bool parser::ST_tablecol()
{
    destination->write(" ");

    return true;
}
bool parser::ST_caption() // Table caption
{
char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
destination -> write( "<B> Caption is: </B>" );
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
} while (current_lexeme[0] != '>');
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
if (current_lexeme[0] != '<')
{
destination -> write( current_lexeme );
destination -> write( ' ' );
}
} while (current_lexeme[0] != '<');
return true;
}
bool parser::ST_title() // Page title
{
char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
destination -> write( "<B>Page Title: </B>" );
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
} while (current_lexeme[0] != '>');
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
if (current_lexeme[0] != '<')
{
destination -> write( current_lexeme );
destination -> write( ' ' );
}
} while (current_lexeme[0] != '<');
destination -> write("<BR>");
destination -> write("<BR>");
return true;
}
// HTML comment state: entered after "<!--" has been read.
// Reads raw characters until two consecutive '-' characters are seen. In
// supervisor mode the comment is echoed to the output between "COMMENT:" and
// "END COMMENT" markers; otherwise it is dropped silently.
// Returns false when the input ends, true otherwise.
bool parser::ST_comment()
{
    char current_char = '\0';
    char prev_char = '\0';

    if (supervisor)
        destination -> write("<BR><B>COMMENT:</B> <!--");

    do
    {
        prev_char = current_char;
        if (!get_character(&current_char))
            return false; // EOF
        if (supervisor)
        {
            // NOTE(review): each '<' inside the comment is replaced by a
            // literal ">" and skipped - confirm this is the intended escape.
            while (current_char == '<')
            {
                destination -> write(">");
                if (!get_character(&current_char))
                    return false; // EOF
            }
            // Output runs one character behind the input. prev_char is still
            // NUL on the first pass, so test it (not current_char, which
            // get_character() never sets to NUL) to avoid writing a NUL byte.
            if (prev_char != '\0')
                destination -> write( prev_char );
        }
    } while (!((current_char == '-') && (prev_char == '-')));

    if (supervisor)
        destination -> write(" <B>END COMMENT</B><BR>");
    return true;
}
bool parser::ST_link() // Link
{
char current_lexeme[BUFFER_SIZE]; // Lexeme currently being processed
char blank[5];
in_link = true;
do
{
if (!get_lexeme(current_lexeme))
return false; // EOF
if ( !stricmp(current_lexeme, "HREF") ){ if(!ST_href()) return false; }
} while (current_lexeme[0] != '>');
if (!ignored_section)
{
if ((!