// Implementation of StripTags
#include <iostream.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "striptags.h"
bool striptag::strip_tags( char* fname_in, char* fname_out, char* url, char* title, char* size )
{
char buffer[10]; // Character buffer
char caps_buffer[10]; // Capitalised buffer
FILE *fp_in; // File pointers
// Initalise variables
last_printed = (char)32;
chars_in_buffer = 0;
no_links = 0;
// Initalise state flags
is_link = false;
is_link_text = false;
is_table = false;
is_tag = false;
is_alt = false;
is_alt_text = false;
is_target = false;
is_target_text = false;
is_special = false;
is_script = false;
is_comment = false;
// Open files
fp_in = fopen( fname_in, "r" );
if ( fp_in == NULL )
{
cout << "Input file not found\n";
return false;
}
fp_out = fopen( fname_out, "w" );
if ( fp_out == NULL )
{
cout << "Cannot open output file\n";
return false;
}
// Write HTML headers
striptag::file_output ( "<!-- " );
striptag::file_output ( url );
striptag::file_output ( " -->\n" );
striptag::file_output ( "<HTML>\n" );
striptag::file_output ( "<BODY>\n<TITLE>" );
striptag::file_output ( title );
striptag::file_output ( "</TITLE>\n<FONT SIZE=\"" );
striptag::file_output ( size );
striptag::file_output ( "\">" );
striptag::file_output ( title );
striptag::file_output ( "\n<BR><BR>\n" );
// Clear buffer
for ( loop = 0; loop < 10; loop++ )
buffer[loop] = NULL;
// Loop through file
do
{
// Prepare buffer for new character
for ( loop = 0; loop < 8; loop++ )
buffer[loop] = buffer[loop+1];
// Read character from input file into buffer
if ( !feof ( fp_in ) )
{
buffer[8] = fgetc( fp_in );
if ( chars_in_buffer < 9 )
chars_in_buffer++;
}
else
chars_in_buffer--;
// Create copy of buffer containing upper case characters
for ( loop = 0; loop < 9; loop++ )
caps_buffer[loop] = toupper(buffer[loop]);
// Attempt to recognise beginnings of sections
striptag::start_symbols ( caps_buffer );
// Output printable text
striptag::output_text ( buffer );
// Attempt to recognide ends of sections
striptag::ending_symbols ( caps_buffer );
} while ( chars_in_buffer > 0 ); // Loop until EOF and buffer empty
// Output a list of links
/*striptag::file_output ( "<BR><BR>Links:<BR>" );
for ( loop = 1; loop < no_links+1; loop++ )
{
striptag::file_output ( itoa ( loop, temp, 10 ) );
striptag::file_output ( ": " );
striptag::file_output ( link[loop] );
striptag::file_output ( "<BR>" );
}*/
// Write HTML footers
striptag::file_output ( "\n<BR><BR><A HREF=\"" );
striptag::file_output ( url );
striptag::file_output ( "\">Go to the actual page on the web</A><BR>\n" );
striptag::file_output ( "<A HREF=\"mailto:webnewsspeak@cc.umist.ac.uk?subject=" );
striptag::file_output ( "Page or link error\">Report errors or dead links</A>\n" );
striptag::file_output ( "</FONT>\n" );
striptag::file_output ( "</BODY>\n" );
striptag::file_output ( "</HTML>" );
// Close files
fclose ( fp_in );
fclose ( fp_out );
return true;
}
void striptag::start_symbols ( char* buffer )
{
// Attempt to recognise starting symbols
if ( buffer[0] == '<' )
is_tag = true;
if ( buffer[0] == '&' )
is_special = true;
if ( !strncmp( buffer, "<SCRIPT", 7 ) )
is_script = true;
if ( !strncmp( buffer, "<!--", 4 ) )
is_comment = true;
if ( ( buffer[0] == 34 ) && ( is_alt ) )
is_alt_text = true;
// Ampersand character
if ( !strncmp( buffer, "&", 5 ) )
{
striptag::file_output ( '&' );
last_printed = '&';
}
// Copyright symbol character
if ( !strncmp( buffer, "©", 5 ) )
{
striptag::file_output ( "Copyright" );
last_printed = 'C';
}
// Non-breaking space character
if ( !strncmp( buffer, "&NBSP;", 6 ) )
{
if ( ( last_printed != ' ' ) && ( last_printed != '\n' ) )
{
striptag::file_output ( ' ' );
last_printed = ' ';
}
}
// Tags interpreted as a newline
if ( ( !strncmp( buffer, "<HR>", 4 ) ) || ( !strncmp( buffer, "<BR>", 4 ) ) ||
( !strncmp( buffer, "<P>", 3 ) ) || ( !strncmp( buffer, "</TITLE>", 8 ) ) ||
( !strncmp( buffer, "<DD>", 4 ) ) || ( !strncmp( buffer, "<DT>", 4 ) ) ||
( !strncmp( buffer, "<LI>", 4 ) ) || ( !strncmp( buffer, "<TR", 3 ) ) )
{
if ( last_printed != '\n' )
{
striptag::file_output ( "<BR>" );
last_printed = (char)'\n';
}
}
if ( !strncmp( buffer, "<A ", 3 ) )
is_link = true;
if ( !strncmp( buffer, "ALT", 3 ) )
is_alt = true;
if ( !strncmp( buffer, "</A", 3 ) )
{
is_link = false;
is_link_text = false;
link[no_links][loop2] = NULL;
}
}
void striptag::output_text ( char* buffer )
{
// Store link text
if ( ( !is_tag ) && ( is_link_text ) )
link[no_links][loop2++] = buffer[0];
// Output printable text
if ( ( ( !is_tag ) && ( !is_special ) ) || ( ( is_alt_text ) && ( buffer[0] != 34 ) ) )
{
// If normal printable non-space character
if ( buffer[0] > 32 )
{
striptag::file_output ( buffer[0] );
last_printed = buffer[0];
}
// Else if not following space or newline character
else if ( ( last_printed != ' ' ) && ( last_printed != '\n' ) )
{
// If character is a space or newline
if ( ( buffer[0] == ' ' ) || ( buffer[0] == 10 ) || ( buffer[0] == 13 ) )
{
striptag::file_output ( ' ' );
last_printed = ' ';
}
}
}
}
// Examines the start of the window for symbols that close a section and
// clears the matching state flags.
//
// buffer - the sliding window; strip_tags() passes the upper-cased copy.
//
// While inside a script or comment only their own terminators are looked
// for, so a '>' there does not end the enclosing tag.
// NOTE(review): every '>' seen while is_link is set starts a new link entry,
// including the '>' of tags nested inside an anchor - confirm intent.
void striptag::ending_symbols ( char* buffer )
{
    const bool inside_raw_section = is_script || is_comment;

    if ( inside_raw_section )
    {
        if ( strncmp( buffer, "-->", 3 ) == 0 )
            is_comment = false;
        if ( strncmp( buffer, "/SCRIPT>", 8 ) == 0 )
            is_script = false;
    }
    else
    {
        if ( buffer[0] == '>' )
        {
            // End of a tag; if it was an anchor tag, begin collecting its text
            is_tag = false;
            if ( is_link )
            {
                loop2 = 0;
                no_links++;
                is_link_text = true;
            }
        }
        else if ( is_alt_text && ( buffer[1] == '"' ) )
        {
            // The next character is the closing quote of the ALT text
            is_alt_text = false;
            is_alt = false;
        }
    }

    // A ';' terminates a character entity
    if ( is_special && ( buffer[0] == ';' ) )
        is_special = false;
}
// Writes a NUL-terminated string to the output file (fp_out).
// Overload of file_output for whole strings.
//
// string - text to write; a NULL pointer is ignored.
// Nothing is written if the output file is not open (fp_out is NULL).
void striptag::file_output ( char* string ) // Overloaded function
{
    if ( ( string == NULL ) || ( fp_out == NULL ) )
        return;

    // fputs walks the string once, instead of re-measuring it with strlen()
    // on every loop iteration.
    fputs ( string, fp_out );
}
// Writes a single character to the output file (fp_out).
// Overload of file_output for individual characters.
//
// character - the character to write.
void striptag::file_output ( char character ) // Overloaded function
{
    const int value_to_write = character;
    fputc ( value_to_write, fp_out );
}
// syntax highlighted by Code2HTML, v. 0.8.11