| import os |
| import argparse |
| import time |
| from typing import List, Optional |
| from langchain.schema import Document |
|
|
| from app.crawlers.mosdac_crawler import MOSDACCrawler |
| from app.services.vector_store_service import VectorStoreService |
| from app.document_loaders.pdf_loader import PDFLoader |
| from app.document_loaders.docx_loader import DOCXLoader |
| from app.document_loaders.excel_loader import ExcelLoader |
|
|
def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Add mock rainfall data to the vector store to ensure rainfall data is available

    Three hard-coded documents are created (North East India, North India and
    an all-India overview) and passed to ``vector_store.add_documents`` in a
    single call. All three share one ``crawled_at`` timestamp taken when this
    function starts.

    Args:
        vector_store: Vector store service instance
    """
    print("Adding mock rainfall data to ensure availability...")

    # Take the timestamp once so every document in this batch carries the
    # same value, even if the call straddles a second boundary.
    crawled_at = time.strftime("%Y-%m-%d %H:%M:%S")

    # NOTE(review): these documents are synthetic, yet their metadata has the
    # same shape as crawled pages; confirm downstream consumers can tell them
    # apart if that matters.

    # North East India: daily district readings, annual averages, 2023 monsoon.
    ne_rainfall_doc = _build_mock_rainfall_document(
        page_content="""
        MOSDAC North East India Rainfall Data

        NUMERICAL DATA TABLES:

        Table 1: Daily Rainfall Data for North East India (mm)
        State | District | Rainfall (mm) | Date
        Assam | Guwahati | 45.2 | 2023-08-15
        Assam | Dibrugarh | 62.8 | 2023-08-15
        Assam | Jorhat | 38.6 | 2023-08-15
        Meghalaya | Shillong | 78.4 | 2023-08-15
        Meghalaya | Cherrapunji | 125.6 | 2023-08-15
        Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
        Manipur | Imphal | 35.8 | 2023-08-15
        Mizoram | Aizawl | 42.6 | 2023-08-15
        Nagaland | Kohima | 47.3 | 2023-08-15
        Tripura | Agartala | 51.9 | 2023-08-15

        Table 2: Average Annual Rainfall for North East Indian States (mm)
        State | Average Annual Rainfall (mm)
        Assam | 2818
        Meghalaya | 11872
        Arunachal Pradesh | 3035
        Tripura | 2500
        Manipur | 1467
        Mizoram | 2670
        Nagaland | 1881

        RAINFALL DATA:
        The North East region receives heavy rainfall during the monsoon season (June to September).
        Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.
        Average monsoon rainfall for North East India in 2023: 1721.4 mm (8% above normal).

        Monthly breakdown of rainfall in North East India for 2023:
        June: 385.2 mm
        July: 512.8 mm
        August: 485.6 mm
        September: 337.8 mm
        """,
        source="https://www.mosdac.gov.in/rainfall-data-northeast",
        title="MOSDAC - North East India Rainfall Data",
        keywords="rainfall data, north east, india, precipitation, monsoon",
        crawled_at=crawled_at,
    )

    # North India: same layout as the North East document.
    north_rainfall_doc = _build_mock_rainfall_document(
        page_content="""
        MOSDAC North India Rainfall Data

        NUMERICAL DATA TABLES:

        Table 1: Daily Rainfall Data for North India (mm)
        State | District | Rainfall (mm) | Date
        Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
        Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
        Delhi | New Delhi | 18.7 | 2023-08-15
        Haryana | Chandigarh | 25.2 | 2023-08-15
        Punjab | Amritsar | 21.8 | 2023-08-15
        Himachal Pradesh | Shimla | 35.6 | 2023-08-15
        Uttarakhand | Dehradun | 42.3 | 2023-08-15

        Table 2: Average Annual Rainfall for North Indian States (mm)
        State | Average Annual Rainfall (mm)
        Uttar Pradesh | 990
        Delhi | 820
        Haryana | 620
        Punjab | 649
        Himachal Pradesh | 1520
        Uttarakhand | 1605
        Jammu and Kashmir | 1180

        RAINFALL DATA:
        North India receives most of its rainfall during the monsoon season.
        Western Disturbances also bring rainfall during winter months.
        Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).

        Monthly breakdown of rainfall in North India for 2023:
        June: 158.3 mm
        July: 284.5 mm
        August: 262.4 mm
        September: 120.5 mm
        """,
        source="https://www.mosdac.gov.in/rainfall-data-north",
        title="MOSDAC - North India Rainfall Data",
        keywords="rainfall data, north india, precipitation, monsoon",
        crawled_at=crawled_at,
    )

    # All-India overview: per-region monthly totals and record daily rainfall.
    general_rainfall_doc = _build_mock_rainfall_document(
        page_content="""
        MOSDAC Rainfall Data Overview for India

        NUMERICAL DATA TABLES:

        Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
        Region | June | July | August | September | Total
        North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
        North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
        Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
        South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
        Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2

        Table 2: Highest Daily Rainfall Records (mm)
        Location | State | Rainfall (mm) | Date
        Mawsynram | Meghalaya | 1003.6 | 1985-06-16
        Cherrapunji | Meghalaya | 978.3 | 1995-06-12
        Mumbai | Maharashtra | 944.2 | 2005-07-26
        Dharampur | Gujarat | 823.6 | 2014-07-04
        Agumbe | Karnataka | 738.9 | 2019-08-11

        RAINFALL DATA:
        The Indian monsoon is vital for the country's agriculture and water resources.
        India receives about 80% of its annual rainfall during the monsoon season.
        Average annual rainfall across India: Approximately 1187 mm.

        Regional Variations in Rainfall:
        - North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
        - Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
        - Thar Desert: Lowest rainfall, often less than 300 mm annually.
        - Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
        - Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
        """,
        source="https://www.mosdac.gov.in/rainfall-overview-india",
        title="MOSDAC - India Rainfall Data Overview",
        keywords="rainfall data, india, precipitation, monsoon, regional rainfall",
        crawled_at=crawled_at,
    )

    # Add all three in one call and report how many were added.
    rainfall_docs = [ne_rainfall_doc, north_rainfall_doc, general_rainfall_doc]
    vector_store.add_documents(rainfall_docs)
    print(f"Added {len(rainfall_docs)} mock rainfall data documents to vector store")


def _build_mock_rainfall_document(
    page_content: str,
    source: str,
    title: str,
    keywords: str,
    crawled_at: str,
) -> Document:
    """
    Wrap mock rainfall text in a Document with the shared rainfall metadata

    Args:
        page_content: Text body of the document
        source: Value stored under the "source" metadata key
        title: Value stored under the "title" metadata key
        keywords: Comma-separated keyword string for the "keywords" key
        crawled_at: Timestamp string stored under the "crawled_at" key

    Returns:
        A Document whose metadata also sets "dataType" to "rainfall"
    """
    return Document(
        page_content=page_content,
        metadata={
            "source": source,
            "title": title,
            "crawled_at": crawled_at,
            "keywords": keywords,
            "dataType": "rainfall",
        },
    )
|
|
def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or
    processing local documents

    The steps run in a fixed order: optional reset, optional mock data,
    optional web crawl, then optional local documents. A step that has
    nothing to add prints a notice instead of being skipped silently.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
    """
    vector_store = VectorStoreService(index_path=index_path)

    # Reset first so everything added below lands in a fresh index.
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)

        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)
        else:
            # Report an empty crawl so it is not mistaken for a successful one.
            print("Web crawl returned no documents; nothing added from the crawl.")

    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # A path was given but is unusable: warn rather than ignore it.
            print(
                f"Warning: documents directory '{documents_dir}' is not a directory; "
                "skipping local documents."
            )

    print("Vector store initialization complete.")
|
|
def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Walk a directory tree and add every supported document to the vector store

    PDF, DOCX and Excel (.xlsx/.xls) files are matched case-insensitively by
    extension and loaded with the corresponding loader. Files with any other
    extension are ignored. An error on one file is printed and the walk
    continues with the next file.

    Args:
        documents_dir: Root directory that is searched recursively
        vector_store: Vector store service instance that receives the documents
    """
    print(f"Processing local documents in {documents_dir}...")

    # One row per format: (suffixes, loader, unit word, label). The unit and
    # label only affect the wording of the progress message.
    handlers = (
        (('.pdf',), PDFLoader(), "chunks", "PDF"),
        (('.docx',), DOCXLoader(), "chunks", "DOCX"),
        (('.xlsx', '.xls'), ExcelLoader(), "documents", "Excel"),
    )

    for current_dir, _, filenames in os.walk(documents_dir):
        for filename in filenames:
            full_path = os.path.join(current_dir, filename)
            lowered = filename.lower()

            # The suffix sets are disjoint, so at most one handler matches.
            handler = next((row for row in handlers if lowered.endswith(row[0])), None)
            if handler is None:
                continue
            _, loader, unit, label = handler

            try:
                loaded = loader.load_file(full_path)
                if loaded:
                    print(f"Adding {len(loaded)} {unit} from {label} {filename} to vector store...")
                    vector_store.add_documents(loaded)
            except Exception as exc:
                # Best effort: report the failure and move on to the next file.
                print(f"Error processing {full_path}: {str(exc)}")
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the initialization.
    cli = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")

    # Options are declared in the order they should appear in --help.
    option_specs = [
        ("--crawl", {"action": "store_true", "help": "Crawl the MOSDAC website"}),
        ("--max-pages", {"type": int, "default": 100, "help": "Maximum number of pages to crawl"}),
        ("--max-depth", {"type": int, "default": 3, "help": "Maximum depth to crawl"}),
        ("--documents-dir", {"type": str, "help": "Directory containing local documents to process"}),
        ("--index-path", {"type": str, "default": "vector_index", "help": "Path to save the vector index"}),
        ("--reset", {"action": "store_true", "help": "Reset the existing index"}),
        ("--add-mock-data", {"action": "store_true", "help": "Add mock data to ensure critical data is available"}),
    ]
    for option_name, option_kwargs in option_specs:
        cli.add_argument(option_name, **option_kwargs)

    # argparse turns "--max-pages" into the attribute "max_pages", so the
    # parsed names match the keyword parameters one-to-one.
    initialize_vector_store(**vars(cli.parse_args()))