# File size: 15,974 Bytes | ddffdb8
import sys
import os
# Add the backend directory to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
import argparse
import time
from typing import List, Optional
from langchain.schema import Document
from app.crawlers.mosdac_crawler import MOSDACCrawler
from app.services.vector_store_service import VectorStoreService
from app.document_loaders.pdf_loader import PDFLoader
from app.document_loaders.docx_loader import DOCXLoader
from app.document_loaders.excel_loader import ExcelLoader
# Helper for inspecting what is currently stored in the vector store
def _numerical_data_lines(page_content: str) -> Optional[List[str]]:
    """
    Extract the non-blank lines of the numerical data section of a document

    Args:
        page_content: Full text of the document

    Returns:
        The stripped, non-empty lines found after the "NUMERICAL DATA TABLES:"
        marker and before the "RAINFALL DATA:" marker (when that marker is
        present), or None when the document has no "NUMERICAL DATA TABLES:"
        marker at all
    """
    if "NUMERICAL DATA TABLES:" not in page_content:
        return None
    section = page_content.split("NUMERICAL DATA TABLES:")[1]
    # The narrative part starts at "RAINFALL DATA:"; keep only the tables.
    if "RAINFALL DATA:" in section:
        section = section.split("RAINFALL DATA:")[0]
    return [line.strip() for line in section.strip().split('\n') if line.strip()]


def display_vector_store_contents(vector_store: VectorStoreService, query: str = "rainfall data", limit: int = 5) -> None:
    """
    Display the contents of the vector store

    Runs a similarity search and prints, for every hit, its metadata, a
    300-character content preview and (for documents mentioning rainfall)
    up to ten lines of the numerical data tables. Any exception raised
    while searching or printing is reported on stdout instead of propagating.

    Args:
        vector_store: Vector store service instance
        query: Query to search for
        limit: Maximum number of documents to display
    """
    print(f"\n{'='*80}")
    print(f"VECTOR STORE CONTENTS (Top {limit} results for query: '{query}')")
    print(f"{'='*80}")
    # Search for documents using the query
    try:
        docs = vector_store.similarity_search(query, k=limit)
        if not docs:
            print("\nNo documents found in the vector store.\n")
            return
        print(f"\nFound {len(docs)} documents:\n")
        for position, doc in enumerate(docs, start=1):
            print(f"DOCUMENT {position}:")
            print(f" Source: {doc.metadata.get('source', 'Unknown')}")
            print(f" Title: {doc.metadata.get('title', 'Untitled')}")
            print(f" Crawled at: {doc.metadata.get('crawled_at', 'Unknown')}")
            print(f" Data Type: {doc.metadata.get('dataType', 'Unknown')}")
            # Show content preview (first 300 chars, flattened to one line)
            content_preview = doc.page_content[:300].replace('\n', ' ').strip()
            if len(doc.page_content) > 300:
                content_preview += "..."
            print(f" Content preview: {content_preview}")
            # Show more details if it's a rainfall document
            if "rainfall" in doc.page_content.lower():
                lines = _numerical_data_lines(doc.page_content)
                if lines is not None:
                    print("\n NUMERICAL DATA:")
                    # Print only the first 10 lines of data
                    for line in lines[:10]:
                        print(f" {line}")
                    if len(lines) > 10:
                        print(f" ... and {len(lines) - 10} more lines")
            print("\n" + "-"*80)
    except Exception as e:
        # Best-effort diagnostic helper: report the failure, do not crash the CLI.
        print(f"Error displaying vector store contents: {str(e)}")
def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Seed the vector store with three hand-written rainfall documents

    The documents cover North East India, North India and an all-India
    overview. Each one carries a "NUMERICAL DATA TABLES:" section followed
    by a "RAINFALL DATA:" narrative section and is tagged with
    dataType "rainfall".

    Args:
        vector_store: Vector store service instance
    """
    print("Adding mock rainfall data to ensure availability...")

    def _as_document(text: str, source: str, title: str, keywords: str) -> Document:
        # Every mock document shares the same metadata layout; the crawl
        # timestamp is taken at the moment the document is built.
        return Document(
            page_content=text,
            metadata={
                "source": source,
                "title": title,
                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "keywords": keywords,
                "dataType": "rainfall"
            }
        )

    # North East India rainfall data
    north_east_text = """
MOSDAC North East India Rainfall Data
NUMERICAL DATA TABLES:
Table 1: Daily Rainfall Data for North East India (mm) - Latest Readings
State | District | Rainfall (mm) | Date
Assam | Guwahati | 45.2 | 2023-08-15
Assam | Dibrugarh | 62.8 | 2023-08-15
Assam | Jorhat | 38.6 | 2023-08-15
Meghalaya | Shillong | 78.4 | 2023-08-15
Meghalaya | Cherrapunji | 125.6 | 2023-08-15
Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
Manipur | Imphal | 35.8 | 2023-08-15
Mizoram | Aizawl | 42.6 | 2023-08-15
Nagaland | Kohima | 47.3 | 2023-08-15
Tripura | Agartala | 51.9 | 2023-08-15
Table 2: Average Annual Rainfall for North East Indian States (2020-2023)
State | 2020 (mm) | 2021 (mm) | 2022 (mm) | 2023 (mm) | Average (mm)
Assam | 2756 | 2890 | 2845 | 2780 | 2818
Meghalaya | 11650 | 12100 | 11920 | 11820 | 11872
Arunachal Pradesh | 2980 | 3120 | 3080 | 2960 | 3035
Tripura | 2450 | 2580 | 2520 | 2450 | 2500
Manipur | 1420 | 1510 | 1480 | 1460 | 1467
Mizoram | 2580 | 2750 | 2690 | 2660 | 2670
Nagaland | 1820 | 1950 | 1890 | 1865 | 1881
RAINFALL DATA:
The North East region receives heavy rainfall during the monsoon season (June to September).
Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.
Historical Records:
- Cherrapunji's highest recorded daily rainfall: 1563.3 mm (June 16, 1995)
- Mawsynram's average annual rainfall: 11,862 mm (world record)
- Cherrapunji's average annual rainfall: 11,777 mm
Monthly breakdown of rainfall in North East India for 2023:
June: 385.2 mm
July: 512.8 mm
August: 485.6 mm
September: 337.8 mm
Key Facts:
- Region receives 80% of annual rainfall during monsoon (June-September)
- Pre-monsoon rainfall (March-May) accounts for 15-20% of annual total
- Winter rainfall (December-February) is minimal, about 2-5% of annual total
- Post-monsoon rainfall (October-November) contributes 8-10% of annual total
"""
    north_east = _as_document(
        north_east_text,
        "https://www.mosdac.gov.in/rainfall-data-northeast",
        "MOSDAC - North East India Rainfall Data",
        "rainfall data, north east, india, precipitation, monsoon, historical data",
    )

    # North India rainfall data
    north_text = """
MOSDAC North India Rainfall Data
NUMERICAL DATA TABLES:
Table 1: Daily Rainfall Data for North India (mm)
State | District | Rainfall (mm) | Date
Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
Delhi | New Delhi | 18.7 | 2023-08-15
Haryana | Chandigarh | 25.2 | 2023-08-15
Punjab | Amritsar | 21.8 | 2023-08-15
Himachal Pradesh | Shimla | 35.6 | 2023-08-15
Uttarakhand | Dehradun | 42.3 | 2023-08-15
Table 2: Average Annual Rainfall for North Indian States (mm)
State | Average Annual Rainfall (mm)
Uttar Pradesh | 990
Delhi | 820
Haryana | 620
Punjab | 649
Himachal Pradesh | 1520
Uttarakhand | 1605
Jammu and Kashmir | 1180
RAINFALL DATA:
North India receives most of its rainfall during the monsoon season.
Western Disturbances also bring rainfall during winter months.
Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).
Monthly breakdown of rainfall in North India for 2023:
June: 158.3 mm
July: 284.5 mm
August: 262.4 mm
September: 120.5 mm
"""
    north = _as_document(
        north_text,
        "https://www.mosdac.gov.in/rainfall-data-north",
        "MOSDAC - North India Rainfall Data",
        "rainfall data, north india, precipitation, monsoon",
    )

    # General rainfall description document with more detailed numerical data
    overview_text = """
MOSDAC Rainfall Data Overview for India
NUMERICAL DATA TABLES:
Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
Region | June | July | August | September | Total
North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2
Table 2: Highest Daily Rainfall Records (mm)
Location | State | Rainfall (mm) | Date
Mawsynram | Meghalaya | 1003.6 | 1985-06-16
Cherrapunji | Meghalaya | 978.3 | 1995-06-12
Mumbai | Maharashtra | 944.2 | 2005-07-26
Dharampur | Gujarat | 823.6 | 2014-07-04
Agumbe | Karnataka | 738.9 | 2019-08-11
RAINFALL DATA:
The Indian monsoon is vital for the country's agriculture and water resources.
India receives about 80% of its annual rainfall during the monsoon season.
Average annual rainfall across India: Approximately 1187 mm.
Regional Variations in Rainfall:
- North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
- Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
- Thar Desert: Lowest rainfall, often less than 300 mm annually.
- Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
- Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
"""
    overview = _as_document(
        overview_text,
        "https://www.mosdac.gov.in/rainfall-overview-india",
        "MOSDAC - India Rainfall Data Overview",
        "rainfall data, india, precipitation, monsoon, regional rainfall",
    )

    # Push all three documents to the store in a single call
    mock_documents = [north_east, north, overview]
    vector_store.add_documents(mock_documents)
    print(f"Added {len(mock_documents)} mock rainfall data documents to vector store")
def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or
    processing local documents

    Steps run in this order: optional reset, optional mock data, optional
    web crawl, optional local-document ingestion. A step that has nothing
    to contribute (empty crawl result, documents_dir that is not a
    directory) is reported on stdout rather than skipped silently.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
    """
    # Initialize vector store service
    vector_store = VectorStoreService(index_path=index_path)

    # Reset if requested
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    # Always add mock rainfall data if requested
    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Crawl the MOSDAC website if requested
    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)
        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)
        else:
            # Make an empty crawl visible instead of finishing as if it had worked.
            print("Web crawl returned no documents; nothing added from the crawl.")

    # Process local documents if a directory is provided
    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # A mistyped path used to be ignored without any feedback.
            print(f"Skipping local documents: '{documents_dir}' is not a directory.")

    print("Vector store initialization complete.")
def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Walk a directory tree and load every supported file into the vector store

    Files are matched case-insensitively by extension: .pdf, .docx and
    .xlsx/.xls. Other files are ignored. A failure on one file is printed
    and does not stop the walk.

    Args:
        documents_dir: Directory containing documents to process
        vector_store: Vector store service instance
    """
    print(f"Processing local documents in {documents_dir}...")

    # One entry per supported format: (extensions, loader, progress message)
    handlers = (
        (('.pdf',), PDFLoader(), "Adding {count} chunks from PDF {name} to vector store..."),
        (('.docx',), DOCXLoader(), "Adding {count} chunks from DOCX {name} to vector store..."),
        (('.xlsx', '.xls'), ExcelLoader(), "Adding {count} documents from Excel {name} to vector store..."),
    )

    for current_dir, _subdirs, filenames in os.walk(documents_dir):
        for filename in filenames:
            full_path = os.path.join(current_dir, filename)
            lowered = filename.lower()

            # Pick the first handler whose extensions match; skip anything else.
            handler = next((entry for entry in handlers if lowered.endswith(entry[0])), None)
            if handler is None:
                continue
            _extensions, loader, message = handler

            try:
                loaded = loader.load_file(full_path)
                if loaded:
                    print(message.format(count=len(loaded), name=filename))
                    vector_store.add_documents(loaded)
            except Exception as e:
                print(f"Error processing {full_path}: {str(e)}")
# Command-line entry point: parse options, then reset / seed / ingest / display
# in that order.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")
    parser.add_argument("--crawl", action="store_true", help="Crawl the MOSDAC website")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl")
    parser.add_argument("--max-depth", type=int, default=3, help="Maximum depth to crawl")
    parser.add_argument("--documents-dir", type=str, help="Directory containing local documents to process")
    parser.add_argument("--index-path", type=str, default="vector_index", help="Path to save the vector index")
    parser.add_argument("--reset", action="store_true", help="Reset the existing index")
    parser.add_argument("--add-mock-data", action="store_true", help="Add mock data to ensure critical data is available")
    parser.add_argument("--display", action="store_true", help="Display contents of the vector store")
    parser.add_argument("--query", type=str, default="rainfall data", help="Query to search for when displaying contents")
    parser.add_argument("--limit", type=int, default=5, help="Maximum number of documents to display")
    args = parser.parse_args()

    # Initialize vector store
    vector_store = VectorStoreService(index_path=args.index_path)

    # Process commands
    if args.reset:
        print("Resetting vector index...")
        vector_store.reset_index()
    if args.add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Ingest when either source is requested. Gating on --crawl alone meant
    # that --documents-dir without --crawl was silently ignored.
    # NOTE(review): initialize_vector_store builds its own VectorStoreService
    # for the same index_path; whether the instance above sees those additions
    # depends on VectorStoreService - confirm.
    if args.crawl or args.documents_dir:
        initialize_vector_store(
            crawl=args.crawl,
            max_pages=args.max_pages,
            max_depth=args.max_depth,
            documents_dir=args.documents_dir,
            index_path=args.index_path,
            reset=False,  # We've already handled reset
            add_mock_data=False  # We've already handled mock data
        )

    # Display contents if requested
    if args.display:
        display_vector_store_contents(vector_store, args.query, args.limit)