//#include <Winsock2.h>
#ifndef GLOBAL_DECL
#include "global.h" // Global data types and variables
#endif
// Constructs a getter around the helper objects it works with. The pointers
// are stored exactly as given; nothing is copied here.
//   connect - stored in `connection`; RetrieveURL calls get_page() on it
//   setts   - stored in `settings`; read for the cancel flag and font size
//   lgfile  - stored in `logfile`; receives one log entry per page handled
//   stat    - stored in `status`; receives the "Downloading ..." lines
getter::getter(wsock* connect, config* setts, fileaccess* lgfile, statuswnd* stat)
{
    status     = stat;
    logfile    = lgfile;
    settings   = setts;
    connection = connect;
}
bool getter::RetrieveURL(char* url, char* result, data_item* page_info, unsigned int link_no)
{
char out_path[MAXLEN_LFILENAME]; // Temporary filename storage
char in_path[MAXLEN_LFILENAME]; // Temporary filename storage
bool valid_url; // Flag to store whether or not the url is valid
char fname[MAXLEN_FILENAME]; // Filename of file to be saved
char error[MAXLEN_ERROR]; // Error description
char extension[5]; // File extension of file to be downloaded
char new_url[MAXLEN_URL]; // Newly created URL
char description[MAXLEN_DESCRIPTION+10]; // Description of page
char filter[MAXLEN_FILTER]; // Subpage URL filter
char status_txt[800]; // Status text to send to the status window
bool is_subpage = true; // Is this a subpage?
bool filter_valid; // Does the page match the filter?
char blank[5]; // Used for integer to string conversion
int loop, loop2; // Loop variables
FILE* fp; // File handle used to check if file exists
sub_opts* current_subopts; // Current subpage options
blockoptslist pg_block_options; // Copy of block options list to send to parser
parser* fileparse; // Pointer to an instance of the parser object
// Check if main page - if so direct a pointer at the page's information
if (page_info != NULL)
{
is_subpage = false;
subpage_no = 0;
current_page = page_info;
// Store description for use with sub pages
strcpy(description, page_info -> description);
strcpy(last_description, description);
}
else // Page is a subpage
{
// Restore description from main page
strcpy(description, last_description);
strcat(description, " - Sub Page ");
// Loop through subpage filters
current_page -> sub_options.move_first();
current_subopts = current_page -> sub_options.get_item();
// Check if URL matches one of the url filters
filter_valid = false;
while ((current_subopts != NULL) && (!filter_valid))
{
strcpy(filter, current_subopts -> filter);
if ((filter[0] != NULL) && (strlen(url) > strlen(filter)))
{
for (loop2=0; loop2<(signed)(strlen(url)-strlen(filter)+1); loop2++)
{
if (strnicmp(&(url[loop2]), filter, strlen(filter)) == 0)
filter_valid = true;
}
}
else
filter_valid = true;
// Check if the link number is within the bounds of one of the link limits
if (link_no < current_subopts -> first_link) filter_valid = false;
if (link_no > current_subopts -> last_link) filter_valid = false;
// If suitable filter not found get the next one
if (!filter_valid)
current_subopts = current_page -> sub_options.get_item();
}
}
// Sort out the URL - these can be a real mess!
valid_url = url_splitter(url, new_url, error, is_subpage);
// Generate filenames
strcpy(fname, &new_url[7]);
// Get rid of non-allowed characters
for (loop=0; loop<(signed)strlen(fname); loop++)
if ((fname[loop] == '/') || (fname[loop] == '?') ||
(fname[loop] == ':') || (fname[loop] == '*') ||
(fname[loop] == '?') || (fname[loop] == '\"') ||
(fname[loop] == '>') || (fname[loop] == '|') )
fname[loop] = '.';
strcpy(in_path, "./source pages/");
strcat(in_path, fname);
strcat(in_path, ".htm");
strcpy(out_path, "./converted pages/");
strcat(out_path, fname);
strcat(out_path, ".htm");
// Download has been cancelled!
if (settings -> cancel)
{
logfile -> lock();
logfile -> write("Canc");
logfile -> write( current_page -> description, 25 );
logfile -> write( " = " );
logfile -> write( new_url, 45 );
logfile -> write( " Stopped - Download Cancelled" );
logfile -> unlock();
return false;
}
// Subpage does not match any of the subpage filters - do not download it
if ((is_subpage) && (!filter_valid))
{
strcpy(result, new_url);
logfile -> lock();
logfile -> write( "\nSkip " );
logfile -> write( current_page -> description, 25 );
logfile -> write( " = " );
logfile -> write( new_url, 45 );
logfile -> write( " does not match subpage filters" );
logfile -> unlock();
return false;
}
// Check if file extension is one of the allowed extensions
if (strlen(new_url) > 4)
{
strcpy(extension, &new_url[strlen(new_url) - 4]); // Non-Allowed Extensions:
if ((!stricmp(extension, ".zip") ) ||
(!stricmp(extension, ".bmp") ) ||
(!stricmp(extension, ".jpg") ) ||
(!stricmp(extension, ".gif") ) ||
(!stricmp(extension, ".tif") ) ||
(!stricmp(extension, ".arj") ) ||
(!stricmp(extension, ".tgz") ) ||
(!stricmp(extension, ".tar") ) ||
(!stricmp(extension, ".doc") ) ||
(!stricmp(extension, ".rtf") ) ||
(!stricmp(extension, ".ace") ) )
{
error_page("This page/file had an extension that is not allowed, you can get it at this url:", out_path, new_url);
valid_url = false;
}
}
if (!valid_url) // URL is invalid
{
error_page(error, out_path, new_url);
}
else
{
// Output filename used by calling function to create link to new page
strcpy(result, fname);
strcat(result, ".htm");
// Increment subpage number (purely cosmetic)
if (is_subpage)
subpage_no++;
// Check if page has already been downloaded
fp = fopen( in_path, "r" );
if ( fp == NULL ) // File does not exist - needs downloading
{
// Add page to status window
if (is_subpage)
{
strcat(description, itoa(subpage_no, blank, 10));
}
strcpy(status_txt, "Downloading ");
strcat(status_txt, description);
status -> addline( status_txt );
// Add page to logfile
logfile -> lock();
if (is_subpage)
{
logfile -> write( "\nSP" );
if (subpage_no < 10)
logfile -> write( '0' );
logfile -> write( itoa(subpage_no, blank, 10) );
logfile -> write( ' ' );
logfile -> write( current_page -> description, 25 );
}
else
{
logfile -> write( "\nMain " );
logfile -> write( current_page -> description, 25 );
}
logfile -> write( " = " );
logfile -> write( new_url, 45 );
logfile -> write( " Downloading." );
logfile -> unlock();
// Retrieve the page from the world wide web
if ( !connection -> get_page( new_url, in_path ) )
{
strcpy(error, connection -> error);
error_page(error, out_path, new_url);
}
else
{
// Create an instance of the parser object to parse the page
fileparse = new parser(this, settings);
// Parse the downloaded page
if (is_subpage)
{
// Make a copy of the relevant subpage block options
//pg_block_options = current_subopts -> block_options;
fileparse -> parse(in_path, out_path, new_url, description,
&(current_subopts -> table_options), true,
settings -> font_size,
&(current_subopts -> block_options));
}
else
{
fileparse -> parse(in_path, out_path, new_url, description,
&(current_page -> table_options), false,
settings -> font_size,
&(current_page -> block_options));
}
delete fileparse;
}
}
else
fclose(fp);
}
return true;
}
// Splits a URL into domain, path and filename, fills in whatever a relative
// link leaves out from the values stored for the main page, and rebuilds the
// complete "http://domain/path/filename" form.
//
//   url        - URL to split; this buffer is used as scratch space and is
//                modified in place
//   new_url    - output buffer of MAXLEN_URL bytes. It first receives a copy
//                of `url`, so it still names the page when false is returned,
//                and is overwritten with the rebuilt URL on success
//   error      - output buffer: receives a description whenever false is
//                returned
//   is_subpage - true when the URL is a link found on the main page. Relative
//                links are then resolved against stored_domain / stored_path
//                and absolute links must stay within stored_domain. When
//                false the URL must be absolute, and its domain and path are
//                stored for later subpage calls.
//
// Returns true when new_url holds a complete, usable HTTP URL.
bool getter::url_splitter(char* url, char* new_url, char* error, bool is_subpage)
{
    char domain[MAXLEN_DOMAIN];
    char path[MAXLEN_PATH];
    char old_path[MAXLEN_PATH];
    char filename[MAXLEN_FILENAME];
    char* last_slash;
    bool has_domain = false;

    // Initialise the parts to blank values
    domain[0] = '\0';
    path[0] = '\0';
    filename[0] = '\0';
    old_path[0] = '\0';

    // Hand the caller a copy of the URL for its error reporting. A URL that
    // does not fit is truncated and rejected here instead of overflowing.
    if (strlen(url) >= MAXLEN_URL)
    {
        strncpy(new_url, url, MAXLEN_URL - 1);
        new_url[MAXLEN_URL - 1] = '\0';
        strcpy(error,"ERROR - URL Too Long.");
        return false;
    }
    strcpy(new_url, url);

    // Work out what kind of URL this is
    if (!strnicmp(url, "http://", 7))
        has_domain = true;
    else if (!strnicmp(url, "mailto:", 7))
    {
        strcpy(error,"This is a link to an e-mail address.");
        return false;
    }
    else if (strstr(url, ":") != NULL)
    {
        strcpy(error,"ERROR - Can only download pages using HTTP.");
        return false;
    }

    if (has_domain)
    {
        // url := RIGHT OF url AFTER AND NOT INCLUDING "http://"
        // (memmove, because source and destination overlap)
        memmove(url, url + 7, strlen(url + 7) + 1);

        char* first_slash = strchr(url, '/');
        if (first_slash == NULL)
        {
            // No "/" at all: the whole of url is the domain
            if (strlen(url) >= sizeof(domain))
            {
                strcpy(error,"ERROR - Domain Name Too Long.");
                return false;
            }
            strcpy(domain, url);
            strcpy(path, "/");
        }
        else
        {
            // domain := LEFT OF url UP TO AND NOT INCLUDING FIRST "/"
            size_t domain_len = (size_t)(first_slash - url);
            if (domain_len >= sizeof(domain))
            {
                strcpy(error,"ERROR - Domain Name Too Long.");
                return false;
            }
            memcpy(domain, url, domain_len);
            domain[domain_len] = '\0';

            // url := RIGHT OF url AFTER AND INCLUDING FIRST "/"
            memmove(url, first_slash, strlen(first_slash) + 1);

            // filename := RIGHT OF url AFTER AND NOT INCLUDING LAST "/"
            // (url now starts with "/", so a last slash always exists)
            last_slash = strrchr(url, '/');
            if (strlen(last_slash + 1) >= sizeof(filename))
            {
                strcpy(error,"ERROR - URL Filename Too Long.");
                return false;
            }
            strcpy(filename, last_slash + 1);

            // url := LEFT OF url UP TO AND INCLUDING LAST "/"
            last_slash[1] = '\0';

            // path := url
            if (strlen(url) >= sizeof(path))
            {
                strcpy(error,"ERROR - URL Path Too Long.");
                return false;
            }
            strcpy(path, url);
        }
    }
    else
    {
        last_slash = strrchr(url, '/');
        if (last_slash == NULL)
        {
            // No "/" at all: the whole of url is the filename
            strcpy(path, "/");
            if (strlen(url) >= sizeof(filename))
            {
                strcpy(error,"ERROR - URL Filename Too Long.");
                return false;
            }
            strcpy(filename, url);
        }
        else
        {
            // filename := RIGHT OF url AFTER AND NOT INCLUDING LAST "/"
            if (strlen(last_slash + 1) >= sizeof(filename))
            {
                strcpy(error,"ERROR - URL Filename Too Long.");
                return false;
            }
            strcpy(filename, last_slash + 1);

            // url := LEFT OF url UP TO AND INCLUDING LAST "/"
            last_slash[1] = '\0';

            // path := url
            if (strlen(url) >= sizeof(path))
            {
                strcpy(error,"ERROR - URL Path Too Long.");
                return false;
            }
            strcpy(path, url);
        }
    }

    // Insert missing bits
    if ((domain[0] == '\0') && (is_subpage)) // Relative link to a subpage
    {
        // Use stored domain
        strncpy(domain, stored_domain, sizeof(domain) - 1);
        domain[sizeof(domain) - 1] = '\0';

        // A path that is not rooted, or that is just "/", is taken relative
        // to the stored path of the main page
        if ((path[0] != '/') || (path[1] == '\0'))
        {
            strncpy(old_path, stored_path, sizeof(old_path) - 1);
            old_path[sizeof(old_path) - 1] = '\0';

            // Inside this branch path only starts with "/" when it is exactly
            // "/". The stored path already ends in "/", so that slash is
            // dropped instead of producing "//" in the rebuilt URL.
            const char* suffix = path;
            size_t base_len = strlen(old_path);
            if ((base_len > 0) && (old_path[base_len - 1] == '/') && (suffix[0] == '/'))
                suffix++;

            // Combine new path with stored path
            if (base_len + strlen(suffix) >= sizeof(old_path))
            {
                strcpy(error,"ERROR - URL Path Too Long.");
                return false;
            }
            strcat(old_path, suffix);
            strcpy(path, old_path);
        }
    }
    else if (is_subpage) // Absolute link to subpage
    {
        if (stricmp(domain, stored_domain)) // Sub page is in different domain
        {
            strcpy(error,"ERROR - This sub page was not within the domain of the main page.");
            return false;
        }
    }
    else if (domain[0] == '\0') // Relative link to main page??!?
    {
        strcpy(error,"ERROR - The URL specified is invalid.");
        return false;
    }
    else // Absolute link to main page
    {
        strcpy(stored_domain, domain); // Store domain
        strcpy(stored_path, path);     // Store path
    }

    // Re-construct the URL from its domain-path-filename parts ("http://" is
    // 7 characters; ">=" leaves room for the terminating NUL)
    if (7 + strlen(domain) + strlen(path) + strlen(filename) >= MAXLEN_URL)
    {
        strcpy(error,"ERROR - URL Too Long.");
        return false;
    }
    strcpy(new_url, "http://" );
    strcat(new_url, domain );
    strcat(new_url, path );
    strcat(new_url, filename );
    return true;
}
void getter::error_page(char* error_desc, char* fname, char* url)
{
fileaccess outfile( fname, "w", settings );
if ( !outfile.error )
{
// Write HTML headers
outfile.write( "<HTML>\n" );
outfile.write( "<BODY>\n<TITLE>ERROR</TITLE>\n<FONT SIZE=\"" );
outfile.write( settings -> font_size );
outfile.write( "\">" );
// Write error description and link
outfile.write( error_desc );
outfile.write( "<BR><BR>The URL was: <A HREF=\"" );
outfile.write( url );
outfile.write( "\">" );
outfile.write( url );
outfile.write( "</A>" );
// Write HTML footers
outfile.write ( "\n<BR><BR><A HREF=\"javascript:history.back(1)\">Go back to previous page</A><BR>" );
outfile.write( "<A HREF=\"mailto:webnewsspeak@cc.umist.ac.uk?subject=" );
outfile.write( "Page or link error\">Report errors or dead links</A>\n" );
outfile.write( "</FONT></B>\n" );
outfile.write( "</BODY>\n" );
outfile.write( "</HTML>" );
}
logfile -> lock();
logfile -> write( "\nERR! " );
logfile -> write( current_page -> description, 25 );
logfile -> write( " = " );
logfile -> write( url, 45 );
logfile -> write( " " );
logfile -> write( error_desc );
logfile -> unlock();
}
// syntax highlighted by Code2HTML, v. 0.8.11