"""Command-line script that populates the vector store for the MOSDAC AI Assistant.

Documents can come from three places, each optional: built-in mock rainfall
documents, a crawl of the MOSDAC website, and local PDF/DOCX/Excel files.
"""

import os
import argparse
import textwrap
import time
from typing import List, Optional

from langchain.schema import Document

from app.crawlers.mosdac_crawler import MOSDACCrawler
from app.services.vector_store_service import VectorStoreService
from app.document_loaders.pdf_loader import PDFLoader
from app.document_loaders.docx_loader import DOCXLoader
from app.document_loaders.excel_loader import ExcelLoader


def _build_rainfall_document(
    text: str,
    source: str,
    title: str,
    keywords: str,
    crawled_at: str,
) -> Document:
    """
    Build one mock rainfall Document with the metadata keys used by this script

    Args:
        text: Document body; common leading indentation and surrounding blank lines are removed
        source: Value stored under the "source" metadata key
        title: Value stored under the "title" metadata key
        keywords: Value stored under the "keywords" metadata key
        crawled_at: Timestamp string stored under the "crawled_at" metadata key

    Returns:
        A Document whose "dataType" metadata is "rainfall"
    """
    return Document(
        # dedent so the stored text does not carry this file's code indentation
        page_content=textwrap.dedent(text).strip(),
        metadata={
            "source": source,
            "title": title,
            "crawled_at": crawled_at,
            "keywords": keywords,
            "dataType": "rainfall",
        },
    )


def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Add mock rainfall data to the vector store to ensure rainfall data is available

    Three documents are added in a single add_documents call: North East India,
    North India, and an all-India overview.

    Args:
        vector_store: Vector store service instance
    """
    print("Adding mock rainfall data to ensure availability...")

    # One timestamp for the whole batch so all three documents agree.
    crawled_at = time.strftime("%Y-%m-%d %H:%M:%S")

    # North East India rainfall data
    ne_rainfall_doc = _build_rainfall_document(
        """
        MOSDAC North East India Rainfall Data

        NUMERICAL DATA TABLES:

        Table 1: Daily Rainfall Data for North East India (mm)
        State | District | Rainfall (mm) | Date
        Assam | Guwahati | 45.2 | 2023-08-15
        Assam | Dibrugarh | 62.8 | 2023-08-15
        Assam | Jorhat | 38.6 | 2023-08-15
        Meghalaya | Shillong | 78.4 | 2023-08-15
        Meghalaya | Cherrapunji | 125.6 | 2023-08-15
        Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
        Manipur | Imphal | 35.8 | 2023-08-15
        Mizoram | Aizawl | 42.6 | 2023-08-15
        Nagaland | Kohima | 47.3 | 2023-08-15
        Tripura | Agartala | 51.9 | 2023-08-15

        Table 2: Average Annual Rainfall for North East Indian States (mm)
        State | Average Annual Rainfall (mm)
        Assam | 2818
        Meghalaya | 11872
        Arunachal Pradesh | 3035
        Tripura | 2500
        Manipur | 1467
        Mizoram | 2670
        Nagaland | 1881

        RAINFALL DATA:
        The North East region receives heavy rainfall during the monsoon season (June to September).
        Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.
        Average monsoon rainfall for North East India in 2023: 1721.4 mm (8% above normal).

        Monthly breakdown of rainfall in North East India for 2023:
        June: 385.2 mm
        July: 512.8 mm
        August: 485.6 mm
        September: 337.8 mm
        """,
        source="https://www.mosdac.gov.in/rainfall-data-northeast",
        title="MOSDAC - North East India Rainfall Data",
        keywords="rainfall data, north east, india, precipitation, monsoon",
        crawled_at=crawled_at,
    )

    # North India rainfall data
    north_rainfall_doc = _build_rainfall_document(
        """
        MOSDAC North India Rainfall Data

        NUMERICAL DATA TABLES:

        Table 1: Daily Rainfall Data for North India (mm)
        State | District | Rainfall (mm) | Date
        Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
        Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
        Delhi | New Delhi | 18.7 | 2023-08-15
        Haryana | Chandigarh | 25.2 | 2023-08-15
        Punjab | Amritsar | 21.8 | 2023-08-15
        Himachal Pradesh | Shimla | 35.6 | 2023-08-15
        Uttarakhand | Dehradun | 42.3 | 2023-08-15

        Table 2: Average Annual Rainfall for North Indian States (mm)
        State | Average Annual Rainfall (mm)
        Uttar Pradesh | 990
        Delhi | 820
        Haryana | 620
        Punjab | 649
        Himachal Pradesh | 1520
        Uttarakhand | 1605
        Jammu and Kashmir | 1180

        RAINFALL DATA:
        North India receives most of its rainfall during the monsoon season.
        Western Disturbances also bring rainfall during winter months.
        Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).

        Monthly breakdown of rainfall in North India for 2023:
        June: 158.3 mm
        July: 284.5 mm
        August: 262.4 mm
        September: 120.5 mm
        """,
        source="https://www.mosdac.gov.in/rainfall-data-north",
        title="MOSDAC - North India Rainfall Data",
        keywords="rainfall data, north india, precipitation, monsoon",
        crawled_at=crawled_at,
    )

    # General rainfall description document with more detailed numerical data
    general_rainfall_doc = _build_rainfall_document(
        """
        MOSDAC Rainfall Data Overview for India

        NUMERICAL DATA TABLES:

        Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
        Region | June | July | August | September | Total
        North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
        North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
        Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
        South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
        Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2

        Table 2: Highest Daily Rainfall Records (mm)
        Location | State | Rainfall (mm) | Date
        Mawsynram | Meghalaya | 1003.6 | 1985-06-16
        Cherrapunji | Meghalaya | 978.3 | 1995-06-12
        Mumbai | Maharashtra | 944.2 | 2005-07-26
        Dharampur | Gujarat | 823.6 | 2014-07-04
        Agumbe | Karnataka | 738.9 | 2019-08-11

        RAINFALL DATA:
        The Indian monsoon is vital for the country's agriculture and water resources.
        India receives about 80% of its annual rainfall during the monsoon season.
        Average annual rainfall across India: Approximately 1187 mm.

        Regional Variations in Rainfall:
        - North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
        - Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
        - Thar Desert: Lowest rainfall, often less than 300 mm annually.
        - Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
        - Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
        """,
        source="https://www.mosdac.gov.in/rainfall-overview-india",
        title="MOSDAC - India Rainfall Data Overview",
        keywords="rainfall data, india, precipitation, monsoon, regional rainfall",
        crawled_at=crawled_at,
    )

    # Add the documents to the vector store
    rainfall_docs = [ne_rainfall_doc, north_rainfall_doc, general_rainfall_doc]
    vector_store.add_documents(rainfall_docs)
    print(f"Added {len(rainfall_docs)} mock rainfall data documents to vector store")


def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or processing local documents

    Steps run in this order: optional reset, optional mock data, optional
    crawl, optional local documents.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
    """
    # Initialize vector store service
    vector_store = VectorStoreService(index_path=index_path)

    # Reset if requested
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    # Add mock rainfall data if requested
    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Crawl the MOSDAC website if requested
    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)
        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)

    # Process local documents if a directory is provided
    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # Report the skip so a mistyped path does not go unnoticed.
            print(f"Skipping local documents: {documents_dir} is not a directory")

    print("Vector store initialization complete.")


def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Process local documents and add them to the vector store

    Walks documents_dir recursively. Files ending in .pdf, .docx, .xlsx or
    .xls (case-insensitive) are loaded; other files are ignored. An error on
    one file is printed and the walk continues with the next file.

    Args:
        documents_dir: Directory containing documents to process
        vector_store: Vector store service instance
    """
    print(f"Processing local documents in {documents_dir}...")

    # (extensions, loader, noun used in the progress message, format label)
    handlers = (
        ((".pdf",), PDFLoader(), "chunks", "PDF"),
        ((".docx",), DOCXLoader(), "chunks", "DOCX"),
        ((".xlsx", ".xls"), ExcelLoader(), "documents", "Excel"),
    )

    # Walk through directory
    for root, _, files in os.walk(documents_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            lowered = file_name.lower()

            handler = next((h for h in handlers if lowered.endswith(h[0])), None)
            if handler is None:
                continue
            _, loader, unit, label = handler

            try:
                documents = loader.load_file(file_path)
                if documents:
                    print(f"Adding {len(documents)} {unit} from {label} {file_name} to vector store...")
                    vector_store.add_documents(documents)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                print(f"Error processing {file_path}: {e}")


def main() -> None:
    """Parse command-line arguments and run initialize_vector_store with them."""
    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")
    parser.add_argument("--crawl", action="store_true", help="Crawl the MOSDAC website")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl")
    parser.add_argument("--max-depth", type=int, default=3, help="Maximum depth to crawl")
    parser.add_argument("--documents-dir", type=str, help="Directory containing local documents to process")
    parser.add_argument("--index-path", type=str, default="vector_index", help="Path to save the vector index")
    parser.add_argument("--reset", action="store_true", help="Reset the existing index")
    parser.add_argument("--add-mock-data", action="store_true", help="Add mock data to ensure critical data is available")

    args = parser.parse_args()

    initialize_vector_store(
        crawl=args.crawl,
        max_pages=args.max_pages,
        max_depth=args.max_depth,
        documents_dir=args.documents_dir,
        index_path=args.index_path,
        reset=args.reset,
        add_mock_data=args.add_mock_data
    )


if __name__ == "__main__":
    main()