"""Command-line utility that populates and inspects the MOSDAC vector store.

The script can reset the index, seed it with mock rainfall documents, crawl
the MOSDAC website, ingest local PDF/DOCX/Excel files, and print the top
matches for a query.
"""
import argparse
import os
import sys
import time
from typing import List, Optional

# Add the backend directory to the Python path; this must run before the
# ``app.*`` imports below so that they can be resolved.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))

from langchain.schema import Document

from app.crawlers.mosdac_crawler import MOSDACCrawler
from app.services.vector_store_service import VectorStoreService
from app.document_loaders.pdf_loader import PDFLoader
from app.document_loaders.docx_loader import DOCXLoader
from app.document_loaders.excel_loader import ExcelLoader


def display_vector_store_contents(vector_store: VectorStoreService, query: str = "rainfall data", limit: int = 5):
    """
    Display the contents of the vector store

    Runs a similarity search and prints, for every hit, its metadata, a
    300-character content preview and (for rainfall documents) the first
    ten non-empty lines of the numerical data section. Any exception raised
    while searching or printing is caught and reported on stdout.

    Args:
        vector_store: Vector store service instance
        query: Query to search for
        limit: Maximum number of documents to display
    """
    print(f"\n{'='*80}")
    print(f"VECTOR STORE CONTENTS (Top {limit} results for query: '{query}')")
    print(f"{'='*80}")

    # Search for documents using the query
    try:
        docs = vector_store.similarity_search(query, k=limit)

        if not docs:
            print("\nNo documents found in the vector store.\n")
            return

        print(f"\nFound {len(docs)} documents:\n")

        for i, doc in enumerate(docs):
            print(f"DOCUMENT {i+1}:")
            print(f" Source: {doc.metadata.get('source', 'Unknown')}")
            print(f" Title: {doc.metadata.get('title', 'Untitled')}")
            print(f" Crawled at: {doc.metadata.get('crawled_at', 'Unknown')}")
            print(f" Data Type: {doc.metadata.get('dataType', 'Unknown')}")

            # Show content preview (first 300 chars)
            content_preview = doc.page_content[:300].replace('\n', ' ').strip()
            if len(doc.page_content) > 300:
                content_preview += "..."
            print(f" Content preview: {content_preview}")

            # Show more details if it's a rainfall document
            if "rainfall" in doc.page_content.lower():
                # Extract numerical data tables
                if "NUMERICAL DATA TABLES:" in doc.page_content:
                    print("\n NUMERICAL DATA:")
                    data_section = doc.page_content.split("NUMERICAL DATA TABLES:")[1]
                    # The tables end where the free-text section starts
                    if "RAINFALL DATA:" in data_section:
                        data_section = data_section.split("RAINFALL DATA:")[0]

                    # Print first 10 lines of data
                    lines = [line.strip() for line in data_section.strip().split('\n') if line.strip()]
                    for line in lines[:10]:
                        print(f" {line}")

                    if len(lines) > 10:
                        print(f" ... and {len(lines) - 10} more lines")

            print("\n" + "-"*80)
    except Exception as e:
        # Best-effort diagnostic output: report the failure instead of aborting the script
        print(f"Error displaying vector store contents: {e}")


def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Add mock rainfall data to the vector store to ensure rainfall data is available

    Three hard-coded documents (North East India, North India and an all-India
    overview) are built and passed to ``vector_store.add_documents`` in one call.

    Args:
        vector_store: Vector store service instance
    """
    print("Adding mock rainfall data to ensure availability...")

    # One timestamp shared by all three mock documents
    crawled_at = time.strftime("%Y-%m-%d %H:%M:%S")

    # North East India rainfall data
    ne_rainfall_doc = Document(
        page_content="""
        MOSDAC North East India Rainfall Data

        NUMERICAL DATA TABLES:

        Table 1: Daily Rainfall Data for North East India (mm) - Latest Readings
        State | District | Rainfall (mm) | Date
        Assam | Guwahati | 45.2 | 2023-08-15
        Assam | Dibrugarh | 62.8 | 2023-08-15
        Assam | Jorhat | 38.6 | 2023-08-15
        Meghalaya | Shillong | 78.4 | 2023-08-15
        Meghalaya | Cherrapunji | 125.6 | 2023-08-15
        Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
        Manipur | Imphal | 35.8 | 2023-08-15
        Mizoram | Aizawl | 42.6 | 2023-08-15
        Nagaland | Kohima | 47.3 | 2023-08-15
        Tripura | Agartala | 51.9 | 2023-08-15

        Table 2: Average Annual Rainfall for North East Indian States (2020-2023)
        State | 2020 (mm) | 2021 (mm) | 2022 (mm) | 2023 (mm) | Average (mm)
        Assam | 2756 | 2890 | 2845 | 2780 | 2818
        Meghalaya | 11650 | 12100 | 11920 | 11820 | 11872
        Arunachal Pradesh | 2980 | 3120 | 3080 | 2960 | 3035
        Tripura | 2450 | 2580 | 2520 | 2450 | 2500
        Manipur | 1420 | 1510 | 1480 | 1460 | 1467
        Mizoram | 2580 | 2750 | 2690 | 2660 | 2670
        Nagaland | 1820 | 1950 | 1890 | 1865 | 1881

        RAINFALL DATA:
        The North East region receives heavy rainfall during the monsoon season (June to September).
        Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.

        Historical Records:
        - Cherrapunji's highest recorded daily rainfall: 1563.3 mm (June 16, 1995)
        - Mawsynram's average annual rainfall: 11,862 mm (world record)
        - Cherrapunji's average annual rainfall: 11,777 mm

        Monthly breakdown of rainfall in North East India for 2023:
        June: 385.2 mm
        July: 512.8 mm
        August: 485.6 mm
        September: 337.8 mm

        Key Facts:
        - Region receives 80% of annual rainfall during monsoon (June-September)
        - Pre-monsoon rainfall (March-May) accounts for 15-20% of annual total
        - Winter rainfall (December-February) is minimal, about 2-5% of annual total
        - Post-monsoon rainfall (October-November) contributes 8-10% of annual total
        """,
        metadata={
            "source": "https://www.mosdac.gov.in/rainfall-data-northeast",
            "title": "MOSDAC - North East India Rainfall Data",
            "crawled_at": crawled_at,
            "keywords": "rainfall data, north east, india, precipitation, monsoon, historical data",
            "dataType": "rainfall"
        }
    )

    # North India rainfall data
    north_rainfall_doc = Document(
        page_content="""
        MOSDAC North India Rainfall Data

        NUMERICAL DATA TABLES:

        Table 1: Daily Rainfall Data for North India (mm)
        State | District | Rainfall (mm) | Date
        Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
        Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
        Delhi | New Delhi | 18.7 | 2023-08-15
        Haryana | Chandigarh | 25.2 | 2023-08-15
        Punjab | Amritsar | 21.8 | 2023-08-15
        Himachal Pradesh | Shimla | 35.6 | 2023-08-15
        Uttarakhand | Dehradun | 42.3 | 2023-08-15

        Table 2: Average Annual Rainfall for North Indian States (mm)
        State | Average Annual Rainfall (mm)
        Uttar Pradesh | 990
        Delhi | 820
        Haryana | 620
        Punjab | 649
        Himachal Pradesh | 1520
        Uttarakhand | 1605
        Jammu and Kashmir | 1180

        RAINFALL DATA:
        North India receives most of its rainfall during the monsoon season.
        Western Disturbances also bring rainfall during winter months.
        Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).

        Monthly breakdown of rainfall in North India for 2023:
        June: 158.3 mm
        July: 284.5 mm
        August: 262.4 mm
        September: 120.5 mm
        """,
        metadata={
            "source": "https://www.mosdac.gov.in/rainfall-data-north",
            "title": "MOSDAC - North India Rainfall Data",
            "crawled_at": crawled_at,
            "keywords": "rainfall data, north india, precipitation, monsoon",
            "dataType": "rainfall"
        }
    )

    # General rainfall description document with more detailed numerical data
    general_rainfall_doc = Document(
        page_content="""
        MOSDAC Rainfall Data Overview for India

        NUMERICAL DATA TABLES:

        Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
        Region | June | July | August | September | Total
        North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
        North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
        Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
        South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
        Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2

        Table 2: Highest Daily Rainfall Records (mm)
        Location | State | Rainfall (mm) | Date
        Mawsynram | Meghalaya | 1003.6 | 1985-06-16
        Cherrapunji | Meghalaya | 978.3 | 1995-06-12
        Mumbai | Maharashtra | 944.2 | 2005-07-26
        Dharampur | Gujarat | 823.6 | 2014-07-04
        Agumbe | Karnataka | 738.9 | 2019-08-11

        RAINFALL DATA:
        The Indian monsoon is vital for the country's agriculture and water resources.
        India receives about 80% of its annual rainfall during the monsoon season.
        Average annual rainfall across India: Approximately 1187 mm.

        Regional Variations in Rainfall:
        - North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
        - Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
        - Thar Desert: Lowest rainfall, often less than 300 mm annually.
        - Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
        - Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
        """,
        metadata={
            "source": "https://www.mosdac.gov.in/rainfall-overview-india",
            "title": "MOSDAC - India Rainfall Data Overview",
            "crawled_at": crawled_at,
            "keywords": "rainfall data, india, precipitation, monsoon, regional rainfall",
            "dataType": "rainfall"
        }
    )

    # Add the documents to the vector store
    rainfall_docs = [ne_rainfall_doc, north_rainfall_doc, general_rainfall_doc]
    vector_store.add_documents(rainfall_docs)
    print(f"Added {len(rainfall_docs)} mock rainfall data documents to vector store")


def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False,
    vector_store: Optional[VectorStoreService] = None,
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or processing local documents

    Steps run in this order: optional reset, optional mock data, optional web
    crawl, optional local document ingestion.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index (only used when no
            ``vector_store`` is supplied)
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
        vector_store: Existing vector store service to populate. When omitted,
            a new ``VectorStoreService`` is created for ``index_path``.
    """
    # Reuse the caller's instance when given, so documents added here go into
    # the same object the caller queries afterwards.
    if vector_store is None:
        vector_store = VectorStoreService(index_path=index_path)

    # Reset if requested
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    # Always add mock rainfall data if requested
    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Crawl the MOSDAC website if requested
    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)

        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)

    # Process local documents if a directory is provided
    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # Tell the user why nothing was ingested instead of skipping silently
            print(f"Skipping local documents: '{documents_dir}' is not a directory.")

    print("Vector store initialization complete.")


def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Process local documents and add them to the vector store

    Walks ``documents_dir`` recursively and loads every ``.pdf``, ``.docx``,
    ``.xlsx`` and ``.xls`` file (extension match is case-insensitive). Files
    with other extensions are ignored. A failure on one file is printed and
    does not stop the remaining files from being processed.

    Args:
        documents_dir: Directory containing documents to process
        vector_store: Vector store service instance
    """
    print(f"Processing local documents in {documents_dir}...")

    # (accepted suffixes, loader, wording used in the progress message)
    loaders = (
        (('.pdf',), PDFLoader(), "chunks from PDF"),
        (('.docx',), DOCXLoader(), "chunks from DOCX"),
        (('.xlsx', '.xls'), ExcelLoader(), "documents from Excel"),
    )

    # Walk through directory
    for root, _, files in os.walk(documents_dir):
        for file in files:
            file_path = os.path.join(root, file)
            file_lower = file.lower()

            for suffixes, loader, label in loaders:
                if not file_lower.endswith(suffixes):
                    continue

                try:
                    documents = loader.load_file(file_path)
                    if documents:
                        print(f"Adding {len(documents)} {label} {file} to vector store...")
                        vector_store.add_documents(documents)
                except Exception as e:
                    # Keep going: one unreadable file must not abort the whole run
                    print(f"Error processing {file_path}: {e}")
                break  # a file matches at most one loader


def main() -> None:
    """Parse the command-line arguments and run the requested operations."""
    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")
    parser.add_argument("--crawl", action="store_true", help="Crawl the MOSDAC website")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl")
    parser.add_argument("--max-depth", type=int, default=3, help="Maximum depth to crawl")
    parser.add_argument("--documents-dir", type=str, help="Directory containing local documents to process")
    parser.add_argument("--index-path", type=str, default="vector_index", help="Path to save the vector index")
    parser.add_argument("--reset", action="store_true", help="Reset the existing index")
    parser.add_argument("--add-mock-data", action="store_true", help="Add mock data to ensure critical data is available")
    parser.add_argument("--display", action="store_true", help="Display contents of the vector store")
    parser.add_argument("--query", type=str, default="rainfall data", help="Query to search for when displaying contents")
    parser.add_argument("--limit", type=int, default=5, help="Maximum number of documents to display")

    args = parser.parse_args()

    # Initialize vector store
    vector_store = VectorStoreService(index_path=args.index_path)

    # Process commands
    if args.reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    if args.add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Run ingestion when a crawl OR a local documents directory is requested,
    # so --documents-dir also works without --crawl.
    if args.crawl or args.documents_dir:
        initialize_vector_store(
            crawl=args.crawl,
            max_pages=args.max_pages,
            max_depth=args.max_depth,
            documents_dir=args.documents_dir,
            index_path=args.index_path,
            reset=False,  # We've already handled reset
            add_mock_data=False,  # We've already handled mock data
            vector_store=vector_store,  # populate the instance that --display queries
        )

    # Display contents if requested
    if args.display:
        display_vector_store_contents(vector_store, args.query, args.limit)


if __name__ == "__main__":
    main()