// Interface to ParseHTML
// Implementation is in ParseHTML.cpp
#ifndef PARSER_DECL
#define PARSER_DECL true
// Capacity constant for the lexical analyser's file buffer: parser::buffer
// is declared with BUFFER_SIZE+1 elements.
#define BUFFER_SIZE 500 // Constant storing size of buffer
// Number of elements in parser::table_stack, which is used for nested tables.
#define STACK_DEPTH 20 // Constant stack size for nested tables
#ifndef GLOBAL_DECL
#include "global.h" // Global data types and variables
#endif
#include <stdio.h>
#include <stdlib.h>
// HTML parser declaration. Only the interface is given here; per the note
// at the top of this header the implementation lives in ParseHTML.cpp.
// The public surface is the constructor and parse(); everything else is
// private state and helpers (file I/O wrappers, ignore/unignore switches,
// a lexical analyser and one ST_* member function per parsing state).
class parser
{
private:
// ---- Input / output file objects ----
fileaccess* destination; // Pointer to output file object
fileaccess* source; // Pointer to input file object
char last_char; // Last character read
// ---- Page-level state ----
int frame_index; // Index number for frames
bool has_frames; // Page is a frames page
bool in_link; // Flag to store whether currently within a link
bool is_subpage; // Flag to store whether or not this is a sub page
int stack_index; // Index of top of stack
// NOTE(review): stack_index presumably indexes table_stack below - confirm in ParseHTML.cpp.
bool supervisor; // User is a supervisor
bool show_extern; // Should the parser display external links as links?
// ---- Ignore / filtering state ----
bool ignored_section; // Is the section being ignored (irrespective of supervisor mode)
bool ignored_block; // Is this whole chunk of text being ignored?
char font_size[3]; // Font size for HTML files
// MAXLEN_BLOCKTXT is not defined in this header; it is expected to come from global.h.
char trigger[MAXLEN_BLOCKTXT]; // Current trigger text to switch between un/ignored
// ---- Table tracking (table_stack has STACK_DEPTH elements) ----
unsigned int table_number; // Number of table (facilitates ignoring tables)
unsigned int table_stack[STACK_DEPTH]; // Hopefully they will not be nested too deep!
unsigned int table_ignoring; // Table currently ignoring (copes with nested tables)
unsigned int link_no; // Number of link (used for subpage filtering rules)
// ---- Collaborating objects ----
// NOTE(review): ownership of the four pointers below is not visible here;
// the class declares no destructor - confirm they are not owned by parser.
getter *parent; // Parent instance of the getter object
config *settings; // Pointer to program settings
tableoptslist *pg_table_opts; // Table options for this page
blockoptslist *pg_block_opts; // Block options for this page
// Variables maintained by the lexical analyser
// The buffer array holds BUFFER_SIZE+1 characters.
char buffer[BUFFER_SIZE+1]; // File buffer (circular list in array)
bool buffer_init; // false indicates not yet initialised
int head; // Index of head of circular list
// Private member functions
// NOTE(review): the unsigned int argument is presumably a table number
// (compare table_number above) - confirm against the definition.
ignore_type get_table_opts( unsigned int );
// file_output is overloaded: one overload takes a char* and one a single char.
void file_output ( char* );
void file_output ( char );
char file_input ( void );
// Paired switches for the ignore state; presumably they drive
// ignored_section / ignored_block above - confirm in ParseHTML.cpp.
void ignore_section( void );
void unignore_section( void );
void ignore_block( void );
void unignore_block( void );
// Lexical analyser
// NOTE(review): the meaning of the bool results and of the char* arguments
// (likely output parameters) is not visible in this header.
bool get_lexeme( char* );
void read_file( void );
bool get_character(char* );
void setup_blocks( void );
void get_trigger( void );
// States:
// One member function per parser state. Each returns bool; what true/false
// signifies is defined in ParseHTML.cpp and cannot be told from here.
bool ST_text( void );
bool ST_utag( void );
bool ST_special( void );
// ST_alt is the only state function that takes an argument.
bool ST_alt( char* );
bool ST_image( void );
bool ST_break( void );
bool ST_script( void );
bool ST_style( void );
bool ST_table( void );
bool ST_caption( void );
bool ST_tablerow( void );
bool ST_tablecol( void );
bool ST_tableend( void );
bool ST_comment( void );
bool ST_link( void );
bool ST_href( void );
bool ST_title( void );
bool ST_frame( void );
bool ST_frameset( void );
bool ST_noframes( void );
bool ST_linkend( void );
public:
// Public entry point. The eight parameters are unnamed in this declaration;
// see the definition in ParseHTML.cpp for their meaning. NOTE(review): the
// tableoptslist* and blockoptslist* arguments presumably supply
// pg_table_opts / pg_block_opts - confirm.
bool parse( char*, char*, char*, char*, tableoptslist*, bool, char*, blockoptslist* );
// Takes a getter* and a config*; presumably stored in parent / settings - confirm.
parser( getter*, config* ); // Constructor
};
/* The code below is a minor alteration of a section of code copied
* from the Mozilla source code (aka Netscape) and is subject to
* the Netscape Public License Version 1.0 (the "NPL");
* you may not use this code except in compliance with the NPL.
* You may obtain a copy of the NPL at http://www.mozilla.org/NPL/
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
* Please note that this is only a small subset of all the possible
* entity values. There are actually thousands of them for different
* character sets. I have used the Netscape basic set as a starting point,
* but other values can easily be added below.
*/
/*
 * One row of the named-entity lookup table (Name -> Unicode).
 *
 *   NE   - entity name as written in the table, e.g. "copy"
 *   len  - number of characters in NE, e.g. 4 for "copy"
 *   code - character code in hex, e.g. 0x00a9 for "copy"
 *   text - added for Web News Speak: textual description of the
 *          character; may be NULL (many table entries have none)
 */
struct PA_N2U_struct {
    char *NE;
    int   len;
    int   code;
    char *text;
};

typedef struct PA_N2U_struct PA_N2U; /* Name->Unicode */
/*
 * Table of named character entities.
 *
 * Columns: entity name, length of the name, character code (hex),
 * textual description (NULL where no description is given).
 * The list ends with a sentinel row whose name is NULL.
 *
 * The array is declared static in this header, so every translation
 * unit that includes the header gets its own copy.
 */
static PA_N2U SpecialChars[] = {
    /* Entities listed in both lower- and upper-case spellings */
    { "lt",     2, 0x003c, "less than" },
    { "LT",     2, 0x003c, "less than" },
    { "gt",     2, 0x003e, "greater than" },
    { "GT",     2, 0x003e, "greater than" },
    { "amp",    3, 0x0026, "ampersand" },
    { "AMP",    3, 0x0026, "ampersand" },
    { "quot",   4, 0x0022, NULL },
    { "QUOT",   4, 0x0022, NULL },
    { "nbsp",   4, 0x00a0, " " },
    { "reg",    3, 0x00ae, "registered" },
    { "REG",    3, 0x00ae, "registered" },
    { "copy",   4, 0x00a9, "copyright" },
    { "COPY",   4, 0x00a9, "copyright" },

    /* Symbols and punctuation, 0x00a1 - 0x00bf */
    { "iexcl",  5, 0x00a1, "inverted exclamation mark" },
    { "cent",   4, 0x00a2, "cent" },
    { "pound",  5, 0x00a3, "pound" },
    { "curren", 6, 0x00a4, NULL },
    { "yen",    3, 0x00a5, "yen" },
    { "brvbar", 6, 0x00a6, "bar" },
    { "sect",   4, 0x00a7, NULL },
    { "uml",    3, 0x00a8, NULL },
    { "ordf",   4, 0x00aa, NULL },
    { "laquo",  5, 0x00ab, NULL },
    { "not",    3, 0x00ac, "not" },
    { "shy",    3, 0x00ad, "minus" },
    { "macr",   4, 0x00af, NULL },
    { "deg",    3, 0x00b0, "degrees" },
    { "plusmn", 6, 0x00b1, "plus/minus" },
    { "sup2",   4, 0x00b2, "superscript 2" },
    { "sup3",   4, 0x00b3, "superscript 3" },
    { "acute",  5, 0x00b4, "acute" },
    { "micro",  5, 0x00b5, "micro" },
    { "para",   4, 0x00b6, "paragraph" },
    { "middot", 6, 0x00b7, "dot" },
    { "cedil",  5, 0x00b8, NULL },
    { "sup1",   4, 0x00b9, "superscript 1" },
    { "ordm",   4, 0x00ba, NULL },
    { "raquo",  5, 0x00bb, NULL },
    { "frac14", 6, 0x00bc, "quarter" },
    { "frac12", 6, 0x00bd, "half" },
    { "frac34", 6, 0x00be, "three quarters" },
    { "iquest", 6, 0x00bf, "inverted question mark" },

    /* 0x00c0 - 0x00df */
    { "Agrave", 6, 0x00c0, NULL },
    { "Aacute", 6, 0x00c1, NULL },
    { "Acirc",  5, 0x00c2, NULL },
    { "Atilde", 6, 0x00c3, NULL },
    { "Auml",   4, 0x00c4, NULL },
    { "Aring",  5, 0x00c5, NULL },
    { "AElig",  5, 0x00c6, NULL },
    { "Ccedil", 6, 0x00c7, NULL },
    { "Egrave", 6, 0x00c8, NULL },
    { "Eacute", 6, 0x00c9, NULL },
    { "Ecirc",  5, 0x00ca, NULL },
    { "Euml",   4, 0x00cb, NULL },
    { "Igrave", 6, 0x00cc, NULL },
    { "Iacute", 6, 0x00cd, NULL },
    { "Icirc",  5, 0x00ce, NULL },
    { "Iuml",   4, 0x00cf, NULL },
    { "ETH",    3, 0x00d0, NULL },
    { "Ntilde", 6, 0x00d1, NULL },
    { "Ograve", 6, 0x00d2, NULL },
    { "Oacute", 6, 0x00d3, NULL },
    { "Ocirc",  5, 0x00d4, NULL },
    { "Otilde", 6, 0x00d5, NULL },
    { "Ouml",   4, 0x00d6, NULL },
    { "times",  5, 0x00d7, "times" },
    { "Oslash", 6, 0x00d8, NULL },
    { "Ugrave", 6, 0x00d9, NULL },
    { "Uacute", 6, 0x00da, NULL },
    { "Ucirc",  5, 0x00db, NULL },
    { "Uuml",   4, 0x00dc, NULL },
    { "Yacute", 6, 0x00dd, NULL },
    { "THORN",  5, 0x00de, NULL },
    { "szlig",  5, 0x00df, NULL },

    /* 0x00e0 - 0x00ff */
    { "agrave", 6, 0x00e0, NULL },
    { "aacute", 6, 0x00e1, NULL },
    { "acirc",  5, 0x00e2, NULL },
    { "atilde", 6, 0x00e3, NULL },
    { "auml",   4, 0x00e4, NULL },
    { "aring",  5, 0x00e5, NULL },
    { "aelig",  5, 0x00e6, NULL },
    { "ccedil", 6, 0x00e7, NULL },
    { "egrave", 6, 0x00e8, NULL },
    { "eacute", 6, 0x00e9, NULL },
    { "ecirc",  5, 0x00ea, NULL },
    { "euml",   4, 0x00eb, NULL },
    { "igrave", 6, 0x00ec, NULL },
    { "iacute", 6, 0x00ed, NULL },
    { "icirc",  5, 0x00ee, NULL },
    { "iuml",   4, 0x00ef, NULL },
    { "eth",    3, 0x00f0, NULL },
    { "ntilde", 6, 0x00f1, NULL },
    { "ograve", 6, 0x00f2, NULL },
    { "oacute", 6, 0x00f3, NULL },
    { "ocirc",  5, 0x00f4, NULL },
    { "otilde", 6, 0x00f5, NULL },
    { "ouml",   4, 0x00f6, NULL },
    { "divide", 6, 0x00f7, "divide" },
    { "oslash", 6, 0x00f8, NULL },
    { "ugrave", 6, 0x00f9, NULL },
    { "uacute", 6, 0x00fa, NULL },
    { "ucirc",  5, 0x00fb, NULL },
    { "uuml",   4, 0x00fc, NULL },
    { "yacute", 6, 0x00fd, NULL },
    { "thorn",  5, 0x00fe, NULL },
    { "yuml",   4, 0x00ff, NULL },

    /* End-of-table sentinel */
    { NULL,     0, 0x0000, NULL },
};
#endif
// syntax highlighted by Code2HTML, v. 0.8.11