# ASTROIQ/app/utils/vector_store_data.py
# Ndg07's picture
# Manual update from local script
# ddffdb8
import sys
import os
# Add the backend directory to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
import argparse
import time
from typing import List, Optional
from langchain.schema import Document
from app.crawlers.mosdac_crawler import MOSDACCrawler
from app.services.vector_store_service import VectorStoreService
from app.document_loaders.pdf_loader import PDFLoader
from app.document_loaders.docx_loader import DOCXLoader
from app.document_loaders.excel_loader import ExcelLoader
# Utility for inspecting what is currently stored in the vector store
def display_vector_store_contents(vector_store: VectorStoreService, query: str = "rainfall data", limit: int = 5):
    """
    Print a human-readable preview of the documents matching a query.

    Runs a similarity search on the vector store and prints, for every hit,
    its source / title / crawl time / data type metadata and a content
    preview of at most 300 characters. Hits whose text mentions "rainfall"
    and contains a "NUMERICAL DATA TABLES:" section additionally get the
    first 10 non-empty lines of that section printed.

    Nothing is raised: a failing search is reported on stdout, and a hit that
    cannot be rendered is reported individually so the remaining hits are
    still shown.

    Args:
        vector_store: Vector store service instance; only its
            ``similarity_search(query, k=...)`` method is used
        query: Query to search for
        limit: Maximum number of documents to display; values below 1 skip
            the search entirely
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"VECTOR STORE CONTENTS (Top {limit} results for query: '{query}')")
    print(banner)

    # Asking for zero (or a negative number of) results can never show
    # anything, so do not bother the store with it.
    if limit < 1:
        print("\nNothing to display: limit must be at least 1.\n")
        return

    # Only the search itself is guarded here; rendering errors are handled
    # per document below.
    try:
        docs = vector_store.similarity_search(query, k=limit)
    except Exception as e:
        print(f"Error displaying vector store contents: {str(e)}")
        return

    if not docs:
        print("\nNo documents found in the vector store.\n")
        return

    print(f"\nFound {len(docs)} documents:\n")
    for position, doc in enumerate(docs, start=1):
        try:
            _print_document(position, doc)
        except Exception as e:
            # Keep going: one malformed hit must not hide the others.
            print(f"Error displaying document {position}: {str(e)}")
        print("\n" + "-" * 80)


def _print_document(position: int, doc) -> None:
    """Print metadata, a content preview and any rainfall tables of one search hit."""
    print(f"DOCUMENT {position}:")
    print(f" Source: {doc.metadata.get('source', 'Unknown')}")
    print(f" Title: {doc.metadata.get('title', 'Untitled')}")
    print(f" Crawled at: {doc.metadata.get('crawled_at', 'Unknown')}")
    print(f" Data Type: {doc.metadata.get('dataType', 'Unknown')}")

    # Show content preview (first 300 chars, flattened onto one line)
    content_preview = doc.page_content[:300].replace('\n', ' ').strip()
    if len(doc.page_content) > 300:
        content_preview += "..."
    print(f" Content preview: {content_preview}")

    # Rainfall documents get their numerical tables shown as well. The
    # keyword test is case-insensitive, the section marker test is not.
    if "rainfall" in doc.page_content.lower() and "NUMERICAL DATA TABLES:" in doc.page_content:
        _print_numerical_data(doc.page_content)


def _print_numerical_data(page_content: str, max_lines: int = 10) -> None:
    """Print the first ``max_lines`` non-empty lines of the numerical data section."""
    print("\n NUMERICAL DATA:")
    data_section = page_content.split("NUMERICAL DATA TABLES:")[1]
    # The tables end where the narrative "RAINFALL DATA:" section starts
    # (if there is one; otherwise the whole remainder is kept).
    data_section = data_section.split("RAINFALL DATA:")[0]

    lines = [line.strip() for line in data_section.strip().split('\n') if line.strip()]
    for line in lines[:max_lines]:
        print(f" {line}")
    if len(lines) > max_lines:
        print(f" ... and {len(lines) - max_lines} more lines")
def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Seed the vector store with three hard-coded mock rainfall documents
    (North East India, North India, and an all-India overview)

    Each document carries source / title / crawled_at / keywords / dataType
    metadata; crawled_at is the local time at which the document is built.

    Args:
        vector_store: Vector store service instance
    """
    print("Adding mock rainfall data to ensure availability...")

    def make_rainfall_document(text, source_url, heading, keyword_list):
        # All mock documents share the same metadata layout and data type.
        return Document(
            page_content=text,
            metadata={
                "source": source_url,
                "title": heading,
                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "keywords": keyword_list,
                "dataType": "rainfall"
            }
        )

    # The literals below are left-aligned on purpose: any indentation inside
    # the triple quotes would become part of the stored text.
    north_east_text = """
MOSDAC North East India Rainfall Data
NUMERICAL DATA TABLES:
Table 1: Daily Rainfall Data for North East India (mm) - Latest Readings
State | District | Rainfall (mm) | Date
Assam | Guwahati | 45.2 | 2023-08-15
Assam | Dibrugarh | 62.8 | 2023-08-15
Assam | Jorhat | 38.6 | 2023-08-15
Meghalaya | Shillong | 78.4 | 2023-08-15
Meghalaya | Cherrapunji | 125.6 | 2023-08-15
Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
Manipur | Imphal | 35.8 | 2023-08-15
Mizoram | Aizawl | 42.6 | 2023-08-15
Nagaland | Kohima | 47.3 | 2023-08-15
Tripura | Agartala | 51.9 | 2023-08-15
Table 2: Average Annual Rainfall for North East Indian States (2020-2023)
State | 2020 (mm) | 2021 (mm) | 2022 (mm) | 2023 (mm) | Average (mm)
Assam | 2756 | 2890 | 2845 | 2780 | 2818
Meghalaya | 11650 | 12100 | 11920 | 11820 | 11872
Arunachal Pradesh | 2980 | 3120 | 3080 | 2960 | 3035
Tripura | 2450 | 2580 | 2520 | 2450 | 2500
Manipur | 1420 | 1510 | 1480 | 1460 | 1467
Mizoram | 2580 | 2750 | 2690 | 2660 | 2670
Nagaland | 1820 | 1950 | 1890 | 1865 | 1881
RAINFALL DATA:
The North East region receives heavy rainfall during the monsoon season (June to September).
Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.
Historical Records:
- Cherrapunji's highest recorded daily rainfall: 1563.3 mm (June 16, 1995)
- Mawsynram's average annual rainfall: 11,862 mm (world record)
- Cherrapunji's average annual rainfall: 11,777 mm
Monthly breakdown of rainfall in North East India for 2023:
June: 385.2 mm
July: 512.8 mm
August: 485.6 mm
September: 337.8 mm
Key Facts:
- Region receives 80% of annual rainfall during monsoon (June-September)
- Pre-monsoon rainfall (March-May) accounts for 15-20% of annual total
- Winter rainfall (December-February) is minimal, about 2-5% of annual total
- Post-monsoon rainfall (October-November) contributes 8-10% of annual total
"""

    north_text = """
MOSDAC North India Rainfall Data
NUMERICAL DATA TABLES:
Table 1: Daily Rainfall Data for North India (mm)
State | District | Rainfall (mm) | Date
Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
Delhi | New Delhi | 18.7 | 2023-08-15
Haryana | Chandigarh | 25.2 | 2023-08-15
Punjab | Amritsar | 21.8 | 2023-08-15
Himachal Pradesh | Shimla | 35.6 | 2023-08-15
Uttarakhand | Dehradun | 42.3 | 2023-08-15
Table 2: Average Annual Rainfall for North Indian States (mm)
State | Average Annual Rainfall (mm)
Uttar Pradesh | 990
Delhi | 820
Haryana | 620
Punjab | 649
Himachal Pradesh | 1520
Uttarakhand | 1605
Jammu and Kashmir | 1180
RAINFALL DATA:
North India receives most of its rainfall during the monsoon season.
Western Disturbances also bring rainfall during winter months.
Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).
Monthly breakdown of rainfall in North India for 2023:
June: 158.3 mm
July: 284.5 mm
August: 262.4 mm
September: 120.5 mm
"""

    overview_text = """
MOSDAC Rainfall Data Overview for India
NUMERICAL DATA TABLES:
Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
Region | June | July | August | September | Total
North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2
Table 2: Highest Daily Rainfall Records (mm)
Location | State | Rainfall (mm) | Date
Mawsynram | Meghalaya | 1003.6 | 1985-06-16
Cherrapunji | Meghalaya | 978.3 | 1995-06-12
Mumbai | Maharashtra | 944.2 | 2005-07-26
Dharampur | Gujarat | 823.6 | 2014-07-04
Agumbe | Karnataka | 738.9 | 2019-08-11
RAINFALL DATA:
The Indian monsoon is vital for the country's agriculture and water resources.
India receives about 80% of its annual rainfall during the monsoon season.
Average annual rainfall across India: Approximately 1187 mm.
Regional Variations in Rainfall:
- North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
- Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
- Thar Desert: Lowest rainfall, often less than 300 mm annually.
- Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
- Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
"""

    # Built in the same order as they are stored: North East, North, overview.
    mock_documents = [
        make_rainfall_document(
            north_east_text,
            "https://www.mosdac.gov.in/rainfall-data-northeast",
            "MOSDAC - North East India Rainfall Data",
            "rainfall data, north east, india, precipitation, monsoon, historical data",
        ),
        make_rainfall_document(
            north_text,
            "https://www.mosdac.gov.in/rainfall-data-north",
            "MOSDAC - North India Rainfall Data",
            "rainfall data, north india, precipitation, monsoon",
        ),
        make_rainfall_document(
            overview_text,
            "https://www.mosdac.gov.in/rainfall-overview-india",
            "MOSDAC - India Rainfall Data Overview",
            "rainfall data, india, precipitation, monsoon, regional rainfall",
        ),
    ]

    vector_store.add_documents(mock_documents)
    print(f"Added {len(mock_documents)} mock rainfall data documents to vector store")
def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or
    processing local documents

    Steps run in this order, each only when requested: reset the index, add
    the mock rainfall documents, crawl the website, ingest local documents.
    Steps that end up adding nothing (empty crawl result, documents_dir that
    is not a directory) are reported on stdout instead of being skipped
    silently.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
    """
    # Initialize vector store service
    vector_store = VectorStoreService(index_path=index_path)

    # Reset first so that everything added below lands in a clean index
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    # Add mock rainfall data if requested
    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Crawl the MOSDAC website if requested
    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)
        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)
        else:
            # Make an empty crawl visible; otherwise the run looks successful
            # although nothing was ingested.
            print("Web crawl returned no documents; nothing added from crawl.")

    # Process local documents if a directory is provided
    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # A mistyped path used to be ignored without any feedback.
            print(f"Skipping local documents: '{documents_dir}' is not a directory.")

    print("Vector store initialization complete.")
def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Walk a directory tree and feed every supported file into the vector store

    Supported files are recognised by their (case-insensitive) extension:
    .pdf, .docx, .xlsx and .xls. Other files are ignored. A file that fails
    to load or store is reported on stdout and does not stop the walk.

    Args:
        documents_dir: Directory containing documents to process
        vector_store: Vector store service instance
    """
    print(f"Processing local documents in {documents_dir}...")

    # One entry per supported format: matching suffixes, the loader to use,
    # and the progress message printed before the loaded items are stored.
    handlers = (
        (('.pdf',), PDFLoader(), "Adding {count} chunks from PDF {name} to vector store..."),
        (('.docx',), DOCXLoader(), "Adding {count} chunks from DOCX {name} to vector store..."),
        (('.xlsx', '.xls'), ExcelLoader(), "Adding {count} documents from Excel {name} to vector store..."),
    )

    for folder, _, filenames in os.walk(documents_dir):
        for file in filenames:
            file_path = os.path.join(folder, file)
            lowered_name = file.lower()

            handler = next(
                (entry for entry in handlers if lowered_name.endswith(entry[0])),
                None,
            )
            if handler is None:
                # Unsupported file type: nothing to do.
                continue

            _, loader, progress_template = handler
            try:
                loaded = loader.load_file(file_path)
                if loaded:
                    print(progress_template.format(count=len(loaded), name=file))
                    vector_store.add_documents(loaded)
            except Exception as error:
                print(f"Error processing {file_path}: {error!s}")
def main(argv: Optional[List[str]] = None) -> None:
    """
    Command-line entry point for building and inspecting the vector store

    The requested actions run in a fixed order: --reset, --add-mock-data,
    ingestion (--crawl and/or --documents-dir), then --display.

    Args:
        argv: Argument list to parse; None makes argparse read sys.argv[1:]
    """
    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")
    parser.add_argument("--crawl", action="store_true", help="Crawl the MOSDAC website")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl")
    parser.add_argument("--max-depth", type=int, default=3, help="Maximum depth to crawl")
    parser.add_argument("--documents-dir", type=str, help="Directory containing local documents to process")
    parser.add_argument("--index-path", type=str, default="vector_index", help="Path to save the vector index")
    parser.add_argument("--reset", action="store_true", help="Reset the existing index")
    parser.add_argument("--add-mock-data", action="store_true", help="Add mock data to ensure critical data is available")
    parser.add_argument("--display", action="store_true", help="Display contents of the vector store")
    parser.add_argument("--query", type=str, default="rainfall data", help="Query to search for when displaying contents")
    parser.add_argument("--limit", type=int, default=5, help="Maximum number of documents to display")
    args = parser.parse_args(argv)

    # Initialize vector store
    vector_store = VectorStoreService(index_path=args.index_path)

    # Process commands
    if args.reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    if args.add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Ingest when either source is requested. Previously this ran only under
    # --crawl, so --documents-dir on its own was silently ignored.
    if args.crawl or args.documents_dir:
        # NOTE(review): initialize_vector_store creates its own
        # VectorStoreService for the same index_path. Whether `vector_store`
        # above sees the documents added there depends on how the service
        # persists/loads its index - confirm before relying on --display in
        # the same run.
        initialize_vector_store(
            crawl=args.crawl,
            max_pages=args.max_pages,
            max_depth=args.max_depth,
            documents_dir=args.documents_dir,
            index_path=args.index_path,
            reset=False,  # We've already handled reset
            add_mock_data=False  # We've already handled mock data
        )

    # Display contents if requested
    if args.display:
        display_vector_store_contents(vector_store, args.query, args.limit)


if __name__ == "__main__":
    main()