// Interface to ParseHTML


// Implementation is in ParseHTML.cpp

#ifndef PARSER_DECL


#define PARSER_DECL true


#define BUFFER_SIZE 500         // Capacity of the lexical analyser's file buffer; parser::buffer is declared with BUFFER_SIZE+1 chars

#define STACK_DEPTH 20          // Number of entries in parser::table_stack, i.e. the deepest table nesting that can be recorded


#ifndef GLOBAL_DECL

#include "global.h"     // Global data types and variables

#endif


#include <stdio.h>

#include <stdlib.h>



// HTML parser object (interface to ParseHTML; member functions are
// implemented in ParseHTML.cpp).
//
// Only the constructor and parse() are public. Everything else is internal
// state: file handles, ignore/filter flags, a table-nesting stack, the
// lexical analyser's buffer, and one ST_* member function per parser state.
//
// NOTE(review): the types fileaccess, getter, config, tableoptslist,
// blockoptslist and ignore_type, and the constant MAXLEN_BLOCKTXT, are not
// declared in this header - presumably they come from "global.h"; confirm.
class parser
{
private:

    fileaccess* destination;    // Pointer to output file object

    fileaccess* source;         // Pointer to input file object

    
    char last_char;             // Last character read

    int  frame_index;           // Index number for frames

    bool has_frames;            // Page is a frames page

    bool in_link;               // Flag to store whether currently within a link

    bool is_subpage;            // Flag to store whether or not this is a sub page

    int  stack_index;           // Index of top of stack (presumably table_stack below - confirm in ParseHTML.cpp)

    bool supervisor;            // User is a supervisor

    bool show_extern;           // Should the parser display external links as links?

    bool ignored_section;       // Is the section being ignored (irrespective of supervisor mode)

    bool ignored_block;         // Is this whole chunk of text being ignored?

    char font_size[3];          // Font size for HTML files (3 chars of storage in total)

    
    char trigger[MAXLEN_BLOCKTXT];          // Current trigger text to switch between un/ignored

    unsigned int table_number;              // Number of table (facilitates ignoring tables)

    unsigned int table_stack[STACK_DEPTH];  // Fixed-size stack for nested tables; holds at most STACK_DEPTH entries

    unsigned int table_ignoring;            // Table currently ignoring (copes with nested tables)

    unsigned int link_no;                   // Number of link (used for subpage filtering rules)


    getter        *parent;          // Parent instance of the getter object

    config        *settings;        // Pointer to program settings

    tableoptslist *pg_table_opts;   // Table options for this page

    blockoptslist *pg_block_opts;   // Block options for this page

    
    // Variables maintained by the lexical analyser

    char buffer[BUFFER_SIZE+1]; // File buffer (circular list in array); one char larger than BUFFER_SIZE

    bool buffer_init;           // false indicates not yet initialised

    int  head;                  // Index of head of circular list

    
    // Private member functions
    //
    // NOTE(review): the parameters below are unnamed and the bodies live in
    // ParseHTML.cpp, so their exact contracts cannot be confirmed from this
    // header; the grouping notes are inferred from the names only.

    // Table-option lookup and file I/O helpers; file_output is overloaded
    // for a char* and for a single char.
    ignore_type get_table_opts( unsigned int );
    void file_output ( char* );
    void file_output ( char );
    char file_input ( void );

    // Paired switches - presumably these drive the ignored_section and
    // ignored_block flags above; confirm in ParseHTML.cpp.
    void ignore_section( void );
    void unignore_section( void );
    void ignore_block( void );
    void unignore_block( void );

    // Lexical analyser

    bool get_lexeme( char* );
    void read_file( void );
    bool get_character(char* );

    void setup_blocks( void );
    void get_trigger( void );

    // States:
    // One member function per parser state. All return bool; ST_alt is the
    // only one that takes an argument (a char*).

    bool ST_text( void );
    bool ST_utag( void );
    bool ST_special( void );
    bool ST_alt( char* );
    bool ST_image( void );
    bool ST_break( void );
    bool ST_script( void );
    bool ST_style( void );
    bool ST_table( void );
    bool ST_caption( void );
    bool ST_tablerow( void );
    bool ST_tablecol( void );
    bool ST_tableend( void );
    bool ST_comment( void );
    bool ST_link( void );
    bool ST_href( void );
    bool ST_title( void );
    bool ST_frame( void );
    bool ST_frameset( void );
    bool ST_noframes( void );
    bool ST_linkend( void );

public:

    // Entry point of the parser. Takes five char* arguments, a
    // tableoptslist*, a bool and a blockoptslist*, and returns bool.
    // NOTE(review): the parameters are unnamed here - see the definition in
    // ParseHTML.cpp for their meaning and for what the return value signals.
    bool parse( char*, char*, char*, char*, tableoptslist*, bool, char*, blockoptslist* );
    
    parser( getter*, config* );         // Constructor; takes the parent getter and the program settings (cf. the parent / settings members)

};


/* The code below is a minor alteration of a section of code copied
 * from the Mozilla source code (aka Netscape) and is subject to
 * the Netscape Public License Version 1.0 (the "NPL");
 * you may not use this code except in compliance with the NPL.
 * You may obtain a copy of the NPL at http://www.mozilla.org/NPL/

 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
 * Reserved.

 * Please note that this is only a small subset of all the possible
 * entity values. There are actually thousands of them for different
 * character sets. I have used the Netscape basic set as a starting point,
 * but other values can easily be added below.
 */


/*
 * One row of the named-entity table (Name -> Unicode).
 *
 * A row maps an entity name to its character code and, optionally, to a
 * textual description of the character.
 */
struct PA_N2U_struct {
    char *NE;       /* entity name, e.g. "copy"                                     */
    int   len;      /* length of NE in characters, e.g. 4 for "copy"                */
    int   code;     /* character code in hex, e.g. 0x00a9 for "copy"                */
    char *text;     /* added for Web News Speak: textual description of the
                       character, e.g. "copyright"                                  */
};

typedef struct PA_N2U_struct PA_N2U;    /* Name->Unicode */


/*
 * Table of named character entities.
 *
 * Columns, in PA_N2U field order:
 *   NE    entity name
 *   LEN   length of NE in characters
 *   CODE  character code in hex (0x003c .. 0x00ff in this table; values
 *         above 0x007f are outside 7-bit ASCII)
 *   TEXT  textual description of the character, or NULL where none is given
 *
 * The last row, with NE == NULL and LEN == 0, is the end-of-table sentinel.
 *
 * The array is declared static in a header, so every translation unit that
 * includes this header gets its own private copy.
 *
 * NOTE(review): consumers presumably scan the rows in order up to the NULL
 * sentinel - confirm in ParseHTML.cpp before reordering any entries.
 */
static PA_N2U SpecialChars[] = {
    /* Common entities; several are listed in both lower and upper case */
    {"lt",      2,  0x003c, "less than" },
    {"LT",      2,  0x003c, "less than" },
    {"gt",      2,  0x003e, "greater than" },
    {"GT",      2,  0x003e, "greater than" },
    {"amp",     3,  0x0026, "ampersand" },
    {"AMP",     3,  0x0026, "ampersand" },
    {"quot",    4,  0x0022, NULL },
    {"QUOT",    4,  0x0022, NULL },
    {"nbsp",    4,  0x00a0, " " },
    {"reg",     3,  0x00ae, "registered" },
    {"REG",     3,  0x00ae, "registered" },
    {"copy",    4,  0x00a9, "copyright" },
    {"COPY",    4,  0x00a9, "copyright" },

    /* 0x00a1 - 0x00a7 */
    {"iexcl",   5,  0x00a1, "inverted exclamation mark" },
    {"cent",    4,  0x00a2, "cent" },
    {"pound",   5,  0x00a3, "pound" },
    {"curren",  6,  0x00a4, NULL },
    {"yen",     3,  0x00a5, "yen"},
    {"brvbar",  6,  0x00a6, "bar" },
    {"sect",    4,  0x00a7, NULL },

    /* 0x00a8 - 0x00af (0x00a9 "copy" and 0x00ae "reg" are listed above) */
    {"uml",     3,  0x00a8, NULL },
    {"ordf",    4,  0x00aa, NULL },
    {"laquo",   5,  0x00ab, NULL },
    {"not",     3,  0x00ac, "not" },
    {"shy",     3,  0x00ad, "minus" },
    {"macr",    4,  0x00af, NULL },

    /* 0x00b0 - 0x00b7 */
    {"deg",     3,  0x00b0, "degrees" },
    {"plusmn",  6,  0x00b1, "plus/minus" },
    {"sup2",    4,  0x00b2, "superscript 2" },
    {"sup3",    4,  0x00b3, "superscript 3" },
    {"acute",   5,  0x00b4, "acute" },
    {"micro",   5,  0x00b5, "micro" },
    {"para",    4,  0x00b6, "paragraph" },
    {"middot",  6,  0x00b7, "dot" },

    /* 0x00b8 - 0x00bf */
    {"cedil",   5,  0x00b8, NULL },
    {"sup1",    4,  0x00b9, "superscript 1" },
    {"ordm",    4,  0x00ba, NULL },
    {"raquo",   5,  0x00bb, NULL },
    {"frac14",  6,  0x00bc, "quarter" },
    {"frac12",  6,  0x00bd, "half" },
    {"frac34",  6,  0x00be, "three quarters" },
    {"iquest",  6,  0x00bf, "inverted question mark" },

    /* 0x00c0 - 0x00c7 */
    {"Agrave",  6,  0x00c0, NULL },
    {"Aacute",  6,  0x00c1, NULL },
    {"Acirc",   5,  0x00c2, NULL },
    {"Atilde",  6,  0x00c3, NULL },
    {"Auml",    4,  0x00c4, NULL },
    {"Aring",   5,  0x00c5, NULL },
    {"AElig",   5,  0x00c6, NULL },
    {"Ccedil",  6,  0x00c7, NULL },

    /* 0x00c8 - 0x00cf */
    {"Egrave",  6,  0x00c8, NULL },
    {"Eacute",  6,  0x00c9, NULL },
    {"Ecirc",   5,  0x00ca, NULL },
    {"Euml",    4,  0x00cb, NULL },
    {"Igrave",  6,  0x00cc, NULL },
    {"Iacute",  6,  0x00cd, NULL },
    {"Icirc",   5,  0x00ce, NULL },
    {"Iuml",    4,  0x00cf, NULL },

    /* 0x00d0 - 0x00d7 */
    {"ETH",     3,  0x00d0, NULL },
    {"Ntilde",  6,  0x00d1, NULL },
    {"Ograve",  6,  0x00d2, NULL },
    {"Oacute",  6,  0x00d3, NULL },
    {"Ocirc",   5,  0x00d4, NULL },
    {"Otilde",  6,  0x00d5, NULL },
    {"Ouml",    4,  0x00d6, NULL },
    {"times",   5,  0x00d7, "times" },

    /* 0x00d8 - 0x00df */
    {"Oslash",  6,  0x00d8, NULL },
    {"Ugrave",  6,  0x00d9, NULL },
    {"Uacute",  6,  0x00da, NULL },
    {"Ucirc",   5,  0x00db, NULL },
    {"Uuml",    4,  0x00dc, NULL },
    {"Yacute",  6,  0x00dd, NULL },
    {"THORN",   5,  0x00de, NULL },
    {"szlig",   5,  0x00df, NULL },

    /* 0x00e0 - 0x00e7 */
    {"agrave",  6,  0x00e0, NULL },
    {"aacute",  6,  0x00e1, NULL },
    {"acirc",   5,  0x00e2, NULL },
    {"atilde",  6,  0x00e3, NULL },
    {"auml",    4,  0x00e4, NULL },
    {"aring",   5,  0x00e5, NULL },
    {"aelig",   5,  0x00e6, NULL },
    {"ccedil",  6,  0x00e7, NULL },

    /* 0x00e8 - 0x00ef */
    {"egrave",  6,  0x00e8, NULL },
    {"eacute",  6,  0x00e9, NULL },
    {"ecirc",   5,  0x00ea, NULL },
    {"euml",    4,  0x00eb, NULL },
    {"igrave",  6,  0x00ec, NULL },
    {"iacute",  6,  0x00ed, NULL },
    {"icirc",   5,  0x00ee, NULL },
    {"iuml",    4,  0x00ef, NULL },

    /* 0x00f0 - 0x00f7 */
    {"eth",     3,  0x00f0, NULL },
    {"ntilde",  6,  0x00f1, NULL },
    {"ograve",  6,  0x00f2, NULL },
    {"oacute",  6,  0x00f3, NULL },
    {"ocirc",   5,  0x00f4, NULL },
    {"otilde",  6,  0x00f5, NULL },
    {"ouml",    4,  0x00f6, NULL },
    {"divide",  6,  0x00f7, "divide" },

    /* 0x00f8 - 0x00ff */
    {"oslash",  6,  0x00f8, NULL },
    {"ugrave",  6,  0x00f9, NULL },
    {"uacute",  6,  0x00fa, NULL },
    {"ucirc",   5,  0x00fb, NULL },
    {"uuml",    4,  0x00fc, NULL },
    {"yacute",  6,  0x00fd, NULL },
    {"thorn",   5,  0x00fe, NULL },
    {"yuml",    4,  0x00ff, NULL },

    /* End-of-table sentinel: NE == NULL */
    {NULL,      0,  0x0000, NULL },
};

#endif

// syntax highlighted by Code2HTML, v. 0.8.11