Spaces:

Ndg07
/

ASTROIQ

Sleeping

File size: 15,974 Bytes

ddffdb8

import sys
import os
# Add the backend directory to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))

import argparse
import time
from typing import List, Optional
from langchain.schema import Document

from app.crawlers.mosdac_crawler import MOSDACCrawler
from app.services.vector_store_service import VectorStoreService
from app.document_loaders.pdf_loader import PDFLoader
from app.document_loaders.docx_loader import DOCXLoader
from app.document_loaders.excel_loader import ExcelLoader

# Diagnostic helper: print a preview of what the vector store returns for a query
def display_vector_store_contents(vector_store: VectorStoreService, query: str = "rainfall data", limit: int = 5):
    """
    Display the contents of the vector store

    Runs a similarity search for ``query`` and prints, for each hit, its
    metadata (source, title, crawl time, data type), a preview of at most
    300 characters, and, for documents mentioning "rainfall", up to ten
    lines of the "NUMERICAL DATA TABLES:" section.

    This is best-effort diagnostic output: any exception raised while
    searching or printing is reported on stdout rather than propagated.

    Args:
        vector_store: Vector store service instance
        query: Query to search for
        limit: Maximum number of documents to display
    """
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"VECTOR STORE CONTENTS (Top {limit} results for query: '{query}')")
    print(separator)

    # Search for documents using the query
    try:
        docs = vector_store.similarity_search(query, k=limit)

        if not docs:
            print("\nNo documents found in the vector store.\n")
            return

        print(f"\nFound {len(docs)} documents:\n")

        for position, doc in enumerate(docs, start=1):
            # A document without metadata should still be listed, not abort
            # the whole display through the outer exception handler.
            metadata = doc.metadata or {}

            print(f"DOCUMENT {position}:")
            print(f"  Source: {metadata.get('source', 'Unknown')}")
            print(f"  Title: {metadata.get('title', 'Untitled')}")
            print(f"  Crawled at: {metadata.get('crawled_at', 'Unknown')}")
            print(f"  Data Type: {metadata.get('dataType', 'Unknown')}")

            # Show content preview (first 300 chars, flattened to one line)
            content_preview = doc.page_content[:300].replace('\n', ' ').strip()
            if len(doc.page_content) > 300:
                content_preview += "..."
            print(f"  Content preview: {content_preview}")

            # Show more details if it's a rainfall document
            if "rainfall" in doc.page_content.lower():
                table_lines = _numerical_table_lines(doc.page_content)
                if table_lines is not None:
                    print("\n  NUMERICAL DATA:")
                    # Print first 10 lines of data
                    for line in table_lines[:10]:
                        print(f"    {line}")
                    if len(table_lines) > 10:
                        print(f"    ... and {len(table_lines) - 10} more lines")

            print("\n" + "-" * 80)

    except Exception as e:
        print(f"Error displaying vector store contents: {e}")


def _numerical_table_lines(page_content: str) -> Optional[List[str]]:
    """
    Return the non-blank, stripped lines of the numerical tables section.

    The section starts after the "NUMERICAL DATA TABLES:" marker and ends at
    the next "RAINFALL DATA:" marker (or at the next tables marker / end of
    text). Returns None when the tables marker is absent.
    """
    tables_marker = "NUMERICAL DATA TABLES:"
    if tables_marker not in page_content:
        return None

    # split(...)[0] returns the whole text when the end marker is missing.
    data_section = page_content.split(tables_marker)[1].split("RAINFALL DATA:")[0]
    return [line.strip() for line in data_section.strip().split('\n') if line.strip()]

def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Seed the vector store with three hand-written mock rainfall documents
    (North East India, North India, and an all-India overview).

    Each document is tagged with ``dataType`` "rainfall" and a ``crawled_at``
    timestamp taken at the moment the document is built.

    Args:
        vector_store: Vector store service instance
    """
    print("Adding mock rainfall data to ensure availability...")

    def _rainfall_document(text, url, heading, tags):
        # All mock documents share the same metadata layout; only the
        # source URL, title and keywords vary.
        return Document(
            page_content=text,
            metadata={
                "source": url,
                "title": heading,
                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "keywords": tags,
                "dataType": "rainfall"
            }
        )

    # North East India rainfall data
    north_east = _rainfall_document(
        """
        MOSDAC North East India Rainfall Data
        
        NUMERICAL DATA TABLES:
        
        Table 1: Daily Rainfall Data for North East India (mm) - Latest Readings
        State | District | Rainfall (mm) | Date
        Assam | Guwahati | 45.2 | 2023-08-15
        Assam | Dibrugarh | 62.8 | 2023-08-15
        Assam | Jorhat | 38.6 | 2023-08-15
        Meghalaya | Shillong | 78.4 | 2023-08-15
        Meghalaya | Cherrapunji | 125.6 | 2023-08-15
        Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
        Manipur | Imphal | 35.8 | 2023-08-15
        Mizoram | Aizawl | 42.6 | 2023-08-15
        Nagaland | Kohima | 47.3 | 2023-08-15
        Tripura | Agartala | 51.9 | 2023-08-15
        
        Table 2: Average Annual Rainfall for North East Indian States (2020-2023)
        State | 2020 (mm) | 2021 (mm) | 2022 (mm) | 2023 (mm) | Average (mm)
        Assam | 2756 | 2890 | 2845 | 2780 | 2818
        Meghalaya | 11650 | 12100 | 11920 | 11820 | 11872
        Arunachal Pradesh | 2980 | 3120 | 3080 | 2960 | 3035
        Tripura | 2450 | 2580 | 2520 | 2450 | 2500
        Manipur | 1420 | 1510 | 1480 | 1460 | 1467
        Mizoram | 2580 | 2750 | 2690 | 2660 | 2670
        Nagaland | 1820 | 1950 | 1890 | 1865 | 1881
        
        RAINFALL DATA:
        The North East region receives heavy rainfall during the monsoon season (June to September).
        Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.
        
        Historical Records:
        - Cherrapunji's highest recorded daily rainfall: 1563.3 mm (June 16, 1995)
        - Mawsynram's average annual rainfall: 11,862 mm (world record)
        - Cherrapunji's average annual rainfall: 11,777 mm
        
        Monthly breakdown of rainfall in North East India for 2023:
        June: 385.2 mm
        July: 512.8 mm
        August: 485.6 mm
        September: 337.8 mm
        
        Key Facts:
        - Region receives 80% of annual rainfall during monsoon (June-September)
        - Pre-monsoon rainfall (March-May) accounts for 15-20% of annual total
        - Winter rainfall (December-February) is minimal, about 2-5% of annual total
        - Post-monsoon rainfall (October-November) contributes 8-10% of annual total
        """,
        "https://www.mosdac.gov.in/rainfall-data-northeast",
        "MOSDAC - North East India Rainfall Data",
        "rainfall data, north east, india, precipitation, monsoon, historical data",
    )

    # North India rainfall data
    north = _rainfall_document(
        """
        MOSDAC North India Rainfall Data
        
        NUMERICAL DATA TABLES:
        
        Table 1: Daily Rainfall Data for North India (mm)
        State | District | Rainfall (mm) | Date
        Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
        Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
        Delhi | New Delhi | 18.7 | 2023-08-15
        Haryana | Chandigarh | 25.2 | 2023-08-15
        Punjab | Amritsar | 21.8 | 2023-08-15
        Himachal Pradesh | Shimla | 35.6 | 2023-08-15
        Uttarakhand | Dehradun | 42.3 | 2023-08-15
        
        Table 2: Average Annual Rainfall for North Indian States (mm)
        State | Average Annual Rainfall (mm)
        Uttar Pradesh | 990
        Delhi | 820
        Haryana | 620
        Punjab | 649
        Himachal Pradesh | 1520
        Uttarakhand | 1605
        Jammu and Kashmir | 1180
        
        RAINFALL DATA:
        North India receives most of its rainfall during the monsoon season.
        Western Disturbances also bring rainfall during winter months.
        Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).
        
        Monthly breakdown of rainfall in North India for 2023:
        June: 158.3 mm
        July: 284.5 mm
        August: 262.4 mm
        September: 120.5 mm
        """,
        "https://www.mosdac.gov.in/rainfall-data-north",
        "MOSDAC - North India Rainfall Data",
        "rainfall data, north india, precipitation, monsoon",
    )

    # General rainfall description document with more detailed numerical data
    overview = _rainfall_document(
        """
        MOSDAC Rainfall Data Overview for India
        
        NUMERICAL DATA TABLES:
        
        Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
        Region | June | July | August | September | Total
        North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
        North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
        Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
        South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
        Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2
        
        Table 2: Highest Daily Rainfall Records (mm)
        Location | State | Rainfall (mm) | Date
        Mawsynram | Meghalaya | 1003.6 | 1985-06-16
        Cherrapunji | Meghalaya | 978.3 | 1995-06-12
        Mumbai | Maharashtra | 944.2 | 2005-07-26
        Dharampur | Gujarat | 823.6 | 2014-07-04
        Agumbe | Karnataka | 738.9 | 2019-08-11
        
        RAINFALL DATA:
        The Indian monsoon is vital for the country's agriculture and water resources.
        India receives about 80% of its annual rainfall during the monsoon season.
        Average annual rainfall across India: Approximately 1187 mm.
        
        Regional Variations in Rainfall:
        - North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
        - Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
        - Thar Desert: Lowest rainfall, often less than 300 mm annually.
        - Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
        - Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
        """,
        "https://www.mosdac.gov.in/rainfall-overview-india",
        "MOSDAC - India Rainfall Data Overview",
        "rainfall data, india, precipitation, monsoon, regional rainfall",
    )

    # Push all mock documents into the store in a single call
    mock_documents = [north_east, north, overview]
    vector_store.add_documents(mock_documents)
    print(f"Added {len(mock_documents)} mock rainfall data documents to vector store")

def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or 
    processing local documents

    Steps run in a fixed order: optional reset, optional mock data, optional
    web crawl, optional local document ingestion. A ``documents_dir`` that is
    set but is not an existing directory is reported and skipped instead of
    being ignored silently.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
    """
    # Initialize vector store service
    vector_store = VectorStoreService(index_path=index_path)

    # Reset if requested (before anything is added)
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    # Add mock rainfall data if requested
    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Crawl the MOSDAC website if requested
    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)

        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)
        else:
            # Make an empty crawl visible rather than finishing silently.
            print("Web crawl returned no documents; nothing added from the crawl.")

    # Process local documents if a directory is provided
    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # A mistyped path would otherwise look like a successful run.
            print(f"Warning: documents directory does not exist or is not a directory, skipping: {documents_dir}")

    print("Vector store initialization complete.")

def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Walk a directory tree and ingest every PDF, DOCX and Excel file found.

    Files are matched by case-insensitive extension; anything else is
    skipped. A failure on one file is printed and does not stop the walk.

    Args:
        documents_dir: Directory containing documents to process
        vector_store: Vector store service instance
    """
    print(f"Processing local documents in {documents_dir}...")

    # One loader per supported format, created once up front
    pdf_loader = PDFLoader()
    docx_loader = DOCXLoader()
    excel_loader = ExcelLoader()

    # (accepted suffixes, loader, wording used in the progress message)
    handlers = (
        (('.pdf',), pdf_loader, "chunks from PDF"),
        (('.docx',), docx_loader, "chunks from DOCX"),
        (('.xlsx', '.xls'), excel_loader, "documents from Excel"),
    )

    for current_dir, _subdirs, filenames in os.walk(documents_dir):
        for filename in filenames:
            full_path = os.path.join(current_dir, filename)
            lowered = filename.lower()

            handler = next((entry for entry in handlers if lowered.endswith(entry[0])), None)
            if handler is None:
                # Unsupported file type
                continue
            _suffixes, loader, description = handler

            try:
                loaded = loader.load_file(full_path)
                if loaded:
                    print(f"Adding {len(loaded)} {description} {filename} to vector store...")
                    vector_store.add_documents(loaded)
            except Exception as exc:
                print(f"Error processing {full_path}: {str(exc)}")

if __name__ == "__main__":
    # Command-line entry point: reset / seed / ingest / display, in that order.
    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")

    parser.add_argument("--crawl", action="store_true", help="Crawl the MOSDAC website")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl")
    parser.add_argument("--max-depth", type=int, default=3, help="Maximum depth to crawl")
    parser.add_argument("--documents-dir", type=str, help="Directory containing local documents to process")
    parser.add_argument("--index-path", type=str, default="vector_index", help="Path to save the vector index")
    parser.add_argument("--reset", action="store_true", help="Reset the existing index")
    parser.add_argument("--add-mock-data", action="store_true", help="Add mock data to ensure critical data is available")
    parser.add_argument("--display", action="store_true", help="Display contents of the vector store")
    parser.add_argument("--query", type=str, default="rainfall data", help="Query to search for when displaying contents")
    parser.add_argument("--limit", type=int, default=5, help="Maximum number of documents to display")

    args = parser.parse_args()

    # Initialize vector store
    vector_store = VectorStoreService(index_path=args.index_path)

    # Process commands
    if args.reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    if args.add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Ingest when either source was requested. Gating this on --crawl alone
    # would silently ignore --documents-dir whenever --crawl is not given.
    if args.crawl or args.documents_dir:
        # NOTE(review): initialize_vector_store builds its own
        # VectorStoreService for the same index_path; whether the instance
        # used for --display below sees those additions depends on how the
        # service persists/reloads the index - confirm.
        initialize_vector_store(
            crawl=args.crawl,
            max_pages=args.max_pages,
            max_depth=args.max_depth,
            documents_dir=args.documents_dir,
            index_path=args.index_path,
            reset=False,  # We've already handled reset
            add_mock_data=False  # We've already handled mock data
        )

    # Display contents if requested
    if args.display:
        display_vector_store_contents(vector_store, args.query, args.limit)