// Implementation of StripTags


#include <iostream.h>

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

#include "striptags.h"


// Converts the HTML file `fname_in` into a simplified HTML page written to
// `fname_out`.
//
// The input is streamed one character at a time through a 9-character sliding
// window. A character enters the window at slot 8 and is acted upon once it
// has been shifted down to slot 0, so start_symbols() / ending_symbols() can
// look ahead at the characters that follow it (their longest patterns are
// 8 characters long).
//
// Parameters:
//   fname_in  - path of the file opened for reading
//   fname_out - path of the file opened for writing
//   url       - written into a leading comment and used as the HREF of the
//               "Go to the actual page on the web" footer link
//   title     - written inside <TITLE> and again as visible text
//   size      - written as the value of the FONT SIZE attribute
//
// Returns false (after printing a message to cout) if either file cannot be
// opened, true otherwise.
bool striptag::strip_tags( char* fname_in, char* fname_out, char* url, char* title, char* size )
{
    char buffer[10];                // Sliding window (slots 0..8 are used)
    char caps_buffer[10];           // Upper-case copy of the window
    FILE *fp_in;                    // Input file
    bool input_exhausted = false;   // Set once fgetc() has reported EOF or an error
    int next_char;                  // Result of fgetc(); int so EOF stays distinguishable

    // Initialise variables

    last_printed = (char)32;
    chars_in_buffer = 0;
    no_links = 0;

    // Initialise state flags

    is_link = false;
    is_link_text = false;
    is_table = false;
    is_tag = false;
    is_alt = false;
    is_alt_text = false;
    is_target = false;
    is_target_text = false;
    is_special = false;
    is_script = false;
    is_comment = false;

    // Open files

    fp_in = fopen( fname_in, "r" );
    if ( fp_in == NULL )
    {
        cout << "Input file not found\n";
        return false;
    }

    fp_out = fopen( fname_out, "w" );
    if ( fp_out == NULL )
    {
        cout << "Cannot open output file\n";
        fclose ( fp_in );           // Do not leak the already-opened input file
        return false;
    }

    // Write HTML headers

    striptag::file_output ( "<!-- " );
    striptag::file_output ( url );
    striptag::file_output ( " -->\n" );
    striptag::file_output ( "<HTML>\n" );
    striptag::file_output ( "<BODY>\n<TITLE>" );
    striptag::file_output ( title );
    striptag::file_output ( "</TITLE>\n<FONT SIZE=\"" );
    striptag::file_output ( size );
    striptag::file_output ( "\">" );
    striptag::file_output ( title );
    striptag::file_output ( "\n<BR><BR>\n" );


    // Clear both buffers so that every slot starts as '\0'

    for ( loop = 0; loop < 10; loop++ )
    {
        buffer[loop] = '\0';
        caps_buffer[loop] = '\0';
    }


    // Loop through file

    do
    {
        // Shift the window down one slot to make room for a new character

        for ( loop = 0; loop < 8; loop++ )
            buffer[loop] = buffer[loop+1];

        // Read the next character. The result of fgetc() is tested directly
        // (rather than calling feof() beforehand) so that the EOF marker is
        // never stored as a character and a read error also ends the input.

        next_char = EOF;
        if ( !input_exhausted )
        {
            next_char = fgetc( fp_in );
            if ( next_char == EOF )
                input_exhausted = true;
        }

        if ( next_char != EOF )
        {
            buffer[8] = (char)next_char;

            // Every one of the 9 window slots must still be shifted out
            // after this character, however short the input is.
            chars_in_buffer = 9;
        }
        else
        {
            // Input finished: pad with '\0' and count down while the window drains

            buffer[8] = '\0';
            if ( chars_in_buffer > 0 )
                chars_in_buffer--;
        }

        // Create copy of buffer containing upper case characters.
        // toupper() requires a value representable as unsigned char.

        for ( loop = 0; loop < 9; loop++ )
            caps_buffer[loop] = (char)toupper( (unsigned char)buffer[loop] );

        // Attempt to recognise beginnings of sections

        striptag::start_symbols ( caps_buffer );

        // Output printable text

        striptag::output_text ( buffer );

        // Attempt to recognise ends of sections

        striptag::ending_symbols ( caps_buffer );

    } while ( chars_in_buffer > 0 ); // Loop until input is finished and the window has drained


    // Output a list of links (currently disabled)

    /*striptag::file_output ( "<BR><BR>Links:<BR>" );
    for ( loop = 1; loop < no_links+1; loop++ )
    {
        striptag::file_output ( itoa ( loop, temp, 10 ) );
        striptag::file_output ( ": " );
        striptag::file_output ( link[loop] );
        striptag::file_output ( "<BR>" );
    }*/

    // Write HTML footers

    striptag::file_output ( "\n<BR><BR><A HREF=\"" );
    striptag::file_output ( url );
    striptag::file_output ( "\">Go to the actual page on the web</A><BR>\n" );
    striptag::file_output ( "<A HREF=\"mailto:webnewsspeak@cc.umist.ac.uk?subject=" );
    striptag::file_output ( "Page or link error\">Report errors or dead links</A>\n" );
    striptag::file_output ( "</FONT>\n" );
    striptag::file_output ( "</BODY>\n" );
    striptag::file_output ( "</HTML>" );

    // Close files

    fclose ( fp_in );
    fclose ( fp_out );

    return true;
}

// Examines the start of the (upper-cased) look-ahead window and switches on
// the state flags for any construct that begins at buffer[0]. It also emits
// replacement text for the entities and line-breaking tags it recognises.
//
//   buffer - upper-cased copy of the sliding window; buffer[0] is the
//            character currently being processed.
void striptag::start_symbols ( char* buffer )
{
    // Attempt to recognise starting symbols

    if ( buffer[0] == '<' )
        is_tag = true;

    if ( buffer[0] == '&' )
        is_special = true;

    if ( !strncmp( buffer, "<SCRIPT", 7 ) )
        is_script = true;

    if ( !strncmp( buffer, "<!--", 4 ) )
        is_comment = true;

    // A double quote while an ALT attribute is pending opens the ALT text

    if ( ( buffer[0] == '"' ) && ( is_alt ) )
        is_alt_text = true;

    // Ampersand character

    if ( !strncmp( buffer, "&AMP;", 5 ) )
    {
        striptag::file_output ( '&' );
        last_printed = '&';
    }

    // Copyright symbol character.
    // All 6 characters are compared (as for &NBSP; below) so that the
    // terminating ';' is part of the match.

    if ( !strncmp( buffer, "&COPY;", 6 ) )
    {
        striptag::file_output ( "Copyright" );
        last_printed = 'C';
    }

    // Non-breaking space character: emitted as a plain space, but never
    // directly after another space or a line break

    if ( !strncmp( buffer, "&NBSP;", 6 ) )
    {
        if ( ( last_printed != ' ' ) && ( last_printed != '\n' ) )
        {
            striptag::file_output ( ' ' );
            last_printed = ' ';
        }
    }

    // Tags interpreted as a newline

    if ( ( !strncmp( buffer, "<HR>", 4 ) ) || ( !strncmp( buffer, "<BR>", 4 ) ) ||
         ( !strncmp( buffer, "<P>", 3 ) )  || ( !strncmp( buffer, "</TITLE>", 8 ) ) ||
         ( !strncmp( buffer, "<DD>", 4 ) ) || ( !strncmp( buffer, "<DT>", 4 ) ) ||
         ( !strncmp( buffer, "<LI>", 4 ) ) || ( !strncmp( buffer, "<TR", 3 ) ) )
    {
        // Consecutive line breaks are collapsed into one

        if ( last_printed != '\n' )
        {
            striptag::file_output ( "<BR>" );
            last_printed = (char)'\n';
        }
    }

    if ( !strncmp( buffer, "<A ", 3 ) )
        is_link = true;
    if ( !strncmp( buffer, "ALT", 3 ) )
        is_alt = true;
    if ( !strncmp( buffer, "</A", 3 ) )
    {
        // Terminate the captured link text only when a capture is actually in
        // progress: no_links and loop2 are set up by ending_symbols() at the
        // same moment it raises is_link_text, so for a stray "</A" they do
        // not refer to a live entry.
        // NOTE(review): the dimensions of `link` are declared outside this
        // file and are not checked here - confirm they are large enough.
        if ( is_link_text )
            link[no_links][loop2] = '\0';

        is_link = false;
        is_link_text = false;
    }
}

// Handles the character at the front of the window (buffer[0]): records it
// as link text when a link is being captured, and writes it to the output
// when it counts as visible text.
//
//   buffer - the sliding window in its original case.
void striptag::output_text ( char* buffer )
{
    const char current = buffer[0];

    // Store link text

    if ( is_link_text && !is_tag )
        link[no_links][loop2++] = current;

    // Text is visible either outside any tag/entity, or inside a quoted ALT
    // value (the quote characters themselves are not printed)

    const bool plain_text = !is_tag && !is_special;
    const bool alt_body   = is_alt_text && ( current != '"' );

    if ( !plain_text && !alt_body )
        return;

    // Normal printable non-space character: copy it through unchanged

    if ( current > 32 )
    {
        striptag::file_output ( current );
        last_printed = current;
        return;
    }

    // Space, line feed or carriage return: emit one space, unless the last
    // thing printed was already a space or a line break

    const bool is_blank    = ( current == ' ' ) || ( current == '\n' ) || ( current == '\r' );
    const bool after_blank = ( last_printed == ' ' ) || ( last_printed == '\n' );

    if ( is_blank && !after_blank )
    {
        striptag::file_output ( ' ' );
        last_printed = ' ';
    }
}

// Examines the (upper-cased) window after the current character has been
// handled and switches off the state flags for any construct that ends here.
//
//   buffer - upper-cased copy of the sliding window; buffer[0] is the
//            character that has just been processed.
void striptag::ending_symbols ( char* buffer )
{
    const char head = buffer[0];

    if ( is_script || is_comment )
    {
        // Inside a script or comment only the matching terminator is honoured

        if ( strncmp( buffer, "-->", 3 ) == 0 )
            is_comment = false;

        if ( strncmp( buffer, "/SCRIPT>", 8 ) == 0 )
            is_script = false;
    }
    else if ( head == '>' )
    {
        // End of a tag; if it was an anchor tag, start capturing its text
        // in a fresh link slot

        is_tag = false;

        if ( is_link )
        {
            no_links++;
            loop2 = 0;
            is_link_text = true;
        }
    }
    else if ( is_alt_text && ( buffer[1] == '"' ) )
    {
        // The NEXT character is a double quote, so the ALT text ends with
        // the character just processed

        is_alt_text = false;
        is_alt = false;
    }

    // A semicolon closes a character entity; checked regardless of the
    // branch taken above

    if ( is_special && ( head == ';' ) )
        is_special = false;
}

// Writes a NUL-terminated string to the output file (fp_out).
// Overload of file_output() for strings. A null pointer is ignored instead
// of being passed on to the C library.
void striptag::file_output ( char* string )
{
    if ( string == NULL )
        return;

    // fputs() writes every character up to the terminating NUL in one call,
    // avoiding a strlen() re-scan of the string for each character written
    fputs ( string, fp_out );
}

// Writes a single character to the output file (fp_out).
// Overload of file_output() for individual characters.
void striptag::file_output ( char character )
{
    putc ( character, fp_out );
}

// Syntax highlighted by Code2HTML, v. 0.8.11