# Viewer header: file size 11,623 bytes, ref ddffdb8
import os
import argparse
import time
from typing import List, Optional
from langchain.schema import Document
from app.crawlers.mosdac_crawler import MOSDACCrawler
from app.services.vector_store_service import VectorStoreService
from app.document_loaders.pdf_loader import PDFLoader
from app.document_loaders.docx_loader import DOCXLoader
from app.document_loaders.excel_loader import ExcelLoader
def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
    """
    Seed the vector store with three hard-coded rainfall documents

    The documents (North East India, North India, and an all-India overview)
    are built in that order, each stamped with the local time at which it is
    constructed, and then passed to the store in one ``add_documents`` call.

    Args:
        vector_store: Vector store service instance that receives the documents
    """
    print("Adding mock rainfall data to ensure availability...")

    # NOTE(review): every figure below is a hard-coded placeholder, not data
    # fetched from MOSDAC; the text is kept verbatim so retrieval is unchanged.
    north_east_text = """
MOSDAC North East India Rainfall Data
NUMERICAL DATA TABLES:
Table 1: Daily Rainfall Data for North East India (mm)
State | District | Rainfall (mm) | Date
Assam | Guwahati | 45.2 | 2023-08-15
Assam | Dibrugarh | 62.8 | 2023-08-15
Assam | Jorhat | 38.6 | 2023-08-15
Meghalaya | Shillong | 78.4 | 2023-08-15
Meghalaya | Cherrapunji | 125.6 | 2023-08-15
Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
Manipur | Imphal | 35.8 | 2023-08-15
Mizoram | Aizawl | 42.6 | 2023-08-15
Nagaland | Kohima | 47.3 | 2023-08-15
Tripura | Agartala | 51.9 | 2023-08-15
Table 2: Average Annual Rainfall for North East Indian States (mm)
State | Average Annual Rainfall (mm)
Assam | 2818
Meghalaya | 11872
Arunachal Pradesh | 3035
Tripura | 2500
Manipur | 1467
Mizoram | 2670
Nagaland | 1881
RAINFALL DATA:
The North East region receives heavy rainfall during the monsoon season (June to September).
Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.
Average monsoon rainfall for North East India in 2023: 1721.4 mm (8% above normal).
Monthly breakdown of rainfall in North East India for 2023:
June: 385.2 mm
July: 512.8 mm
August: 485.6 mm
September: 337.8 mm
"""

    north_text = """
MOSDAC North India Rainfall Data
NUMERICAL DATA TABLES:
Table 1: Daily Rainfall Data for North India (mm)
State | District | Rainfall (mm) | Date
Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
Delhi | New Delhi | 18.7 | 2023-08-15
Haryana | Chandigarh | 25.2 | 2023-08-15
Punjab | Amritsar | 21.8 | 2023-08-15
Himachal Pradesh | Shimla | 35.6 | 2023-08-15
Uttarakhand | Dehradun | 42.3 | 2023-08-15
Table 2: Average Annual Rainfall for North Indian States (mm)
State | Average Annual Rainfall (mm)
Uttar Pradesh | 990
Delhi | 820
Haryana | 620
Punjab | 649
Himachal Pradesh | 1520
Uttarakhand | 1605
Jammu and Kashmir | 1180
RAINFALL DATA:
North India receives most of its rainfall during the monsoon season.
Western Disturbances also bring rainfall during winter months.
Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).
Monthly breakdown of rainfall in North India for 2023:
June: 158.3 mm
July: 284.5 mm
August: 262.4 mm
September: 120.5 mm
"""

    overview_text = """
MOSDAC Rainfall Data Overview for India
NUMERICAL DATA TABLES:
Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
Region | June | July | August | September | Total
North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2
Table 2: Highest Daily Rainfall Records (mm)
Location | State | Rainfall (mm) | Date
Mawsynram | Meghalaya | 1003.6 | 1985-06-16
Cherrapunji | Meghalaya | 978.3 | 1995-06-12
Mumbai | Maharashtra | 944.2 | 2005-07-26
Dharampur | Gujarat | 823.6 | 2014-07-04
Agumbe | Karnataka | 738.9 | 2019-08-11
RAINFALL DATA:
The Indian monsoon is vital for the country's agriculture and water resources.
India receives about 80% of its annual rainfall during the monsoon season.
Average annual rainfall across India: Approximately 1187 mm.
Regional Variations in Rainfall:
- North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
- Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
- Thar Desert: Lowest rainfall, often less than 300 mm annually.
- Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
- Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
"""

    # One row per document, in insertion order: (text, source, title, keywords)
    document_specs = (
        (
            north_east_text,
            "https://www.mosdac.gov.in/rainfall-data-northeast",
            "MOSDAC - North East India Rainfall Data",
            "rainfall data, north east, india, precipitation, monsoon",
        ),
        (
            north_text,
            "https://www.mosdac.gov.in/rainfall-data-north",
            "MOSDAC - North India Rainfall Data",
            "rainfall data, north india, precipitation, monsoon",
        ),
        (
            overview_text,
            "https://www.mosdac.gov.in/rainfall-overview-india",
            "MOSDAC - India Rainfall Data Overview",
            "rainfall data, india, precipitation, monsoon, regional rainfall",
        ),
    )

    # The timestamp is taken per document, at the moment each one is built.
    rainfall_docs = [
        Document(
            page_content=body,
            metadata={
                "source": source_url,
                "title": title,
                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "keywords": keywords,
                "dataType": "rainfall"
            }
        )
        for body, source_url, title, keywords in document_specs
    ]

    # Hand all three documents to the store in a single call
    vector_store.add_documents(rainfall_docs)
    print(f"Added {len(rainfall_docs)} mock rainfall data documents to vector store")
def initialize_vector_store(
    crawl: bool = True,
    max_pages: int = 100,
    max_depth: int = 3,
    documents_dir: Optional[str] = None,
    index_path: str = "vector_index",
    reset: bool = False,
    add_mock_data: bool = False
) -> None:
    """
    Initialize the vector store by crawling the MOSDAC website and/or
    processing local documents

    The steps run in a fixed order: optional index reset, optional mock
    rainfall data, optional web crawl, optional local-document ingestion.
    A step that is requested but yields nothing prints a notice rather than
    being skipped silently.

    Args:
        crawl: Whether to crawl the MOSDAC website
        max_pages: Maximum number of pages to crawl
        max_depth: Maximum depth to crawl
        documents_dir: Directory containing local documents to process
        index_path: Path to save the vector index
        reset: Whether to reset the existing index
        add_mock_data: Whether to add mock data to ensure critical data is available
    """
    # Initialize vector store service
    vector_store = VectorStoreService(index_path=index_path)

    # Reset first so everything added below lands in a clean index
    if reset:
        print("Resetting vector index...")
        vector_store.reset_index()

    # Always add mock rainfall data if requested
    if add_mock_data:
        add_mock_rainfall_data(vector_store)

    # Crawl the MOSDAC website if requested
    if crawl:
        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
        crawler = MOSDACCrawler()
        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)
        if documents:
            print(f"Adding {len(documents)} documents from web crawl to vector store...")
            vector_store.add_documents(documents)
        else:
            # An empty crawl result used to pass without any message.
            print("Web crawl returned no documents; nothing added from the crawl.")

    # Process local documents if a directory is provided
    if documents_dir:
        if os.path.isdir(documents_dir):
            process_local_documents(documents_dir, vector_store)
        else:
            # A mistyped or missing path used to be ignored silently, which
            # made a failed ingestion look like a successful run.
            print(f"Warning: documents directory not found, skipping: {documents_dir}")

    print("Vector store initialization complete.")
def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
    """
    Walk a directory tree and add every supported file to the vector store

    Files are matched by extension, case-insensitively: ``.pdf``, ``.docx``,
    and ``.xlsx``/``.xls``. Other files are ignored. An exception raised while
    handling one file is printed and the walk moves on to the next file.

    Args:
        documents_dir: Directory containing documents to process
        vector_store: Vector store service instance
    """
    print(f"Processing local documents in {documents_dir}...")

    # (extensions, loader, unit word, format label) — the last two only feed
    # the progress message printed for each loaded file.
    handlers = (
        (('.pdf',), PDFLoader(), "chunks", "PDF"),
        (('.docx',), DOCXLoader(), "chunks", "DOCX"),
        (('.xlsx', '.xls'), ExcelLoader(), "documents", "Excel"),
    )

    for folder, _, filenames in os.walk(documents_dir):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            lowered = filename.lower()
            try:
                for suffixes, loader, unit, label in handlers:
                    if not lowered.endswith(suffixes):
                        continue
                    loaded = loader.load_file(full_path)
                    if loaded:
                        print(f"Adding {len(loaded)} {unit} from {label} {filename} to vector store...")
                        vector_store.add_documents(loaded)
                    # At most one handler applies to a given file
                    break
            except Exception as exc:
                # Best effort: report the failure and continue with the next file
                print(f"Error processing {full_path}: {exc!s}")
if __name__ == "__main__":
    # Command-line entry point: every flag maps one-to-one onto a keyword
    # argument of initialize_vector_store.
    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")

    # (flag, add_argument keyword options), registered in this order
    cli_options = (
        ("--crawl", {"action": "store_true", "help": "Crawl the MOSDAC website"}),
        ("--max-pages", {"type": int, "default": 100, "help": "Maximum number of pages to crawl"}),
        ("--max-depth", {"type": int, "default": 3, "help": "Maximum depth to crawl"}),
        ("--documents-dir", {"type": str, "help": "Directory containing local documents to process"}),
        ("--index-path", {"type": str, "default": "vector_index", "help": "Path to save the vector index"}),
        ("--reset", {"action": "store_true", "help": "Reset the existing index"}),
        ("--add-mock-data", {"action": "store_true", "help": "Add mock data to ensure critical data is available"}),
    )
    for flag, settings in cli_options:
        parser.add_argument(flag, **settings)

    cli = parser.parse_args()

    initialize_vector_store(
        crawl=cli.crawl,
        max_pages=cli.max_pages,
        max_depth=cli.max_depth,
        documents_dir=cli.documents_dir,
        index_path=cli.index_path,
        reset=cli.reset,
        add_mock_data=cli.add_mock_data
    )