// File: news_1.cpp // // Description: This file was derived from the ..\NetMan\test_nm.cpp // test program file. It serves as a minimal example // for using the NetMan C++ class to retrieve HTML // documents from the Internet, and for storing data // on the local machine using C++ Library class. // The Library class uses the full text indexing // class TextIndex to build complete inverted word // indices for the retrieved documents. The C++ // class HTML is used to access the structured text // data in the retrieved HTML documents. // #include // required for all Windows applications #include "resource.h" // Windows resource IDs #include "resrc1.h" #include "test_nm.h" // specific to this program #include "NetMan.h" #include "Library.h" #include "HTML.h" #include "FileDir.h" #include "TextNews.h" #include "RTFNews.h" // Define test data: //char * HTTP_server[] = { // "www.rns.com", "salvador.mt.cs.cmu.edu"}; //char * docs[] = { // "/cgi-bin/nikos?Windows", "/cgi-bin/pursuit.exe?query=AI+NLP"}; //const int num_docs = 2; // The following data will be retrieved from // the configuration file: Robot.CFG char HTTP_server[10][60]; char docs[10][128]; int num_docs; // Set the maximum number of Web documents to be retrieved from // a single information source. NOTE: be a good Internet citezen, // and keep the following parameter set to a low value. If you // make this value very large (e.g., 50), you will retrieve // documents which a low probability of being interesting to // you. When I am testing this program, I set MAX_SEARCH to 5, // and usually set it to 10 or 20 for acutal use of the program. const int MAX_SEARCH = 10; // Allocate a pointer to a Library object: static Library *library; // Define a few "expert system" style rule functions // used by instances of C++ class HTML to search for // valid references to other embedded HTML document // references: int rule_1(char *str) { if (HTML::STR_FOUND("cgi-", str)) return 1; // disallow str return 0; // str is OK } int rule_2(char *str) { if (HTML::STR_FOUND("http:", str)) return 0; // str is OK return 1; // disallow str } // Define a rule to test the file extension of returned // files. int extension_ok(char *file_extension) { // discard *.Z files if (strcmp(file_extension, "Z") == 0) return 0; // discard *.gz files if (strcmp(file_extension, "gz") == 0) return 0; // discard *.zip files if (strcmp(file_extension, "zip") == 0) return 0; // discard *.ZIP files if (strcmp(file_extension, "ZIP") == 0) return 0; // discard *.tar files if (strcmp(file_extension, "tar") == 0) return 0; // discard *.TAR files if (strcmp(file_extension, "TAR") == 0) return 0; return 1; // file extension is OK } HINSTANCE hInst; // current instance HCURSOR hSaveCursor; // handle to current cursor HCURSOR hHourGlass; // handle to hourglass cursor static FARPROC lpfnDefaultEditBox; char szApplicationName[] = "Test_NetMan"; char szWindowTitle[] = "Test C++ NetMan class"; int CALLBACK WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow) { MSG msg; if (!hPrevInstance) { // Initialize application: if (!InitProgram(hInstance)) { return (FALSE); // exit if we can not initialize } } if (!InitInstance(hInstance, nCmdShow)) { return (FALSE); } // Process Window messages: while (GetMessage(&msg, NULL, 0, 0)) { if (!TranslateAccelerator (msg.hwnd, NULL, &msg)) { TranslateMessage(&msg); DispatchMessage(&msg); } } return (msg.wParam); // Returns the value from PostQuitMessage } BOOL InitProgram(HINSTANCE hInstance) // Init wndow data and { // register class WNDCLASS wc; wc.style = CS_HREDRAW | CS_VREDRAW; wc.lpfnWndProc = (WNDPROC)WndProc; wc.cbClsExtra = 0; wc.cbWndExtra = 0; wc.hInstance = hInstance; wc.hIcon = NULL; wc.hCursor = LoadCursor(NULL, IDC_ARROW); wc.hbrBackground = (HBRUSH)(COLOR_WINDOW+1); wc.lpszMenuName = MAKEINTRESOURCE(IDR_GENERIC); wc.lpszClassName = szApplicationName; return (RegisterClass(&wc)); } BOOL InitInstance(HINSTANCE hInstance, int nCmdShow) { HWND hWnd; // main window handle. hInst = hInstance; // save instance handle // from MSVC2.0 cursor.c example program: hHourGlass = LoadCursor(NULL, IDC_WAIT); // create a main window: hWnd = CreateWindow(szApplicationName, szWindowTitle, WS_OVERLAPPEDWINDOW, CW_USEDEFAULT, 0, CW_USEDEFAULT, 0, NULL, NULL, hInstance, NULL); if (!hWnd) { return (FALSE); // error condition } ShowWindow(hWnd, nCmdShow); UpdateWindow(hWnd); return (TRUE); } // Default Dialog Edit box: static char FAR cpEdit[256]; BOOL APIENTRY DefaultEditBox(HWND hWnd, WORD iMessage, WORD wParam, LONG /* lParam */) { switch (iMessage) { case WM_INITDIALOG: return TRUE; case WM_COMMAND: if (wParam == IDOK) { // OK button hit for (int i=0; i<256; i++) cpEdit[i] = '\0'; GetDlgItemText(hWnd, IDC_EDIT1, &(cpEdit[0]), 80); EndDialog(hWnd, 0); return TRUE; } else { cpEdit[0] = '\0'; return FALSE; } default: return FALSE; } } LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM uParam, LPARAM lParam) { int k, wmId, wmEvent; switch (message) { case WM_COMMAND: // message: command from application menu wmId = LOWORD(uParam); wmEvent = HIWORD(uParam); switch (wmId) { case IDM_EXIT: DestroyWindow (hWnd); break; case IDM_FETCH: { FARPROC lpProc; lpProc = MakeProcInstance((FARPROC)DefaultEditBox, hInst); DialogBox(hInst, MAKEINTRESOURCE(IDD_DIALOG1), hWnd, (DLGPROC)DefaultEditBox); FreeProcInstance(lpProc); } if (cpEdit[0] == '\0') break; #ifdef DEBUG MessageBox(NULL, cpEdit, "Selected keywords",MB_OK); #endif { HTML html("Robot.CFG"); char buf[256]; char host[100]; char doc[200]; char ext[12]; num_docs = 0; for (int iter=0; iter<10; iter++) { int k = html.next_tag_text("robot", buf, 256); if (k) // found a robot tag { int found_host_doc = html.get_host_and_doc(buf, host, doc, ext); if (found_host_doc) { sprintf(&(HTTP_server[num_docs][0]), "%s", host); sprintf(&(docs[num_docs][0]), "%s%s", doc, cpEdit); num_docs++; #ifdef DEBUG char buf2[256]; sprintf(buf2, "Robot host=|%s|, doc=|%s%s|", host, doc, cpEdit); MessageBox(NULL, buf2, "Info from Robot.CFG", MB_OK); #endif } } } } // show hour glass cursor: hSaveCursor = SetCursor(hHourGlass); // The following processing will take a while! // (not the best strategy for Windows apps...) // Delete any existing *.ROB files. We use the // extension .ROB for HTML documents requested // from Internet Robots: { FileDir fd(".\\*.ROB"); int num = fd.number_of_matched_files(); for (int i=0; i MAX_FILE_NAMES) { char buf[256]; sprintf(buf, "Too many HTML docs: max is %d (set in TextInd.h)", MAX_FILE_NAMES); MessageBox( NULL, buf, "Too many HTML files", MB_OK); num = MAX_FILE_NAMES; sprintf(buf, "Only %d HTML docs are going to be processed", num); MessageBox(NULL, buf, "Too many HTML files", MB_OK); } char * files[MAX_FILE_NAMES]; for (int i=0; iadd_files(num, files); delete library; // this closes all files for (i=0; iAI_search(num_key_words, k_words); char *files[50]; int num_files = 0; for (int k=0; knumber_of_documents(); k++) { char buf[128]; sprintf(buf,"Matching doc:%s\nRelevance:%f", library->get_document_name(k), library->get_document_relevance(k)); MessageBox( NULL, buf, "Document match", MB_OK); if (library->get_document_relevance(k) > 0.001) { int doc_name_len = strlen(library->get_document_name(k)); files[num_files] = new char[doc_name_len + 1]; sprintf(files[num_files++], "%s", library->get_document_name(k)); } } TextNews(k_words, num_key_words, files, num_files, "out.txt"); RTFNews (k_words, num_key_words, files, num_files, "out.RTF"); for (k=0; k