Spaces:

Ndg07
/

ASTROIQ

Sleeping

App Files Files Community

ASTROIQ / app /utils /vector_store_data.py

Ndg07

Manual update from local script

ddffdb8 10 months ago

raw

history blame contribute delete

16 kB

	import sys
	import os
	# Add the backend directory to the Python path
	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))

	import argparse
	import time
	from typing import List, Optional
	from langchain.schema import Document

	from app.crawlers.mosdac_crawler import MOSDACCrawler
	from app.services.vector_store_service import VectorStoreService
	from app.document_loaders.pdf_loader import PDFLoader
	from app.document_loaders.docx_loader import DOCXLoader
	from app.document_loaders.excel_loader import ExcelLoader

	# Debugging helper: print what the vector store returns for a sample query
	def display_vector_store_contents(vector_store: VectorStoreService, query: str = "rainfall data", limit: int = 5):
	    """
	    Print the top matches the vector store returns for a query.

	    For every hit the source, title, crawl time and data type are read from
	    ``doc.metadata`` (with a fallback label when a key is missing), followed
	    by a preview of the first 300 characters of the content. Documents whose
	    text mentions "rainfall" and contains a "NUMERICAL DATA TABLES:" section
	    also get the first ten non-empty lines of that section printed.

	    Any ``Exception`` raised while searching or printing is caught and
	    reported on stdout instead of propagating.

	    Args:
	        vector_store: Vector store service instance; only its
	            ``similarity_search(query, k=...)`` method is used here
	        query: Query to search for
	        limit: Maximum number of documents to display
	    """
	    banner = "=" * 80
	    print(f"\n{banner}")
	    print(f"VECTOR STORE CONTENTS (Top {limit} results for query: '{query}')")
	    print(banner)

	    try:
	        docs = vector_store.similarity_search(query, k=limit)

	        if not docs:
	            print("\nNo documents found in the vector store.\n")
	            return

	        print(f"\nFound {len(docs)} documents:\n")

	        for position, doc in enumerate(docs, start=1):
	            _print_document_summary(position, doc)
	            # Separator between consecutive documents
	            print("\n" + "-" * 80)

	    except Exception as e:
	        # Best-effort diagnostic output: report the failure rather than
	        # letting a display problem abort the calling script.
	        print(f"Error displaying vector store contents: {e}")


	def _print_document_summary(position: int, doc) -> None:
	    """Print metadata, a content preview and any numerical tables for one hit."""
	    print(f"DOCUMENT {position}:")
	    metadata = doc.metadata
	    print(f" Source: {metadata.get('source', 'Unknown')}")
	    print(f" Title: {metadata.get('title', 'Untitled')}")
	    print(f" Crawled at: {metadata.get('crawled_at', 'Unknown')}")
	    print(f" Data Type: {metadata.get('dataType', 'Unknown')}")

	    content = doc.page_content

	    # Show content preview (first 300 chars, flattened onto one line)
	    content_preview = content[:300].replace('\n', ' ').strip()
	    if len(content) > 300:
	        content_preview += "..."
	    print(f" Content preview: {content_preview}")

	    # Only rainfall documents get their numerical tables expanded
	    if "rainfall" not in content.lower():
	        return

	    table_lines = _numerical_table_lines(content)
	    if table_lines is None:
	        return

	    print("\n NUMERICAL DATA:")
	    # Print first 10 lines of data
	    for line in table_lines[:10]:
	        print(f" {line}")
	    if len(table_lines) > 10:
	        print(f" ... and {len(table_lines) - 10} more lines")


	def _numerical_table_lines(page_content: str) -> Optional[List[str]]:
	    """
	    Return the stripped, non-empty lines of the numerical-tables section.

	    The section is the text after the "NUMERICAL DATA TABLES:" marker, cut
	    at the "RAINFALL DATA:" marker when that marker follows. Returns None
	    when the content has no "NUMERICAL DATA TABLES:" marker.
	    """
	    if "NUMERICAL DATA TABLES:" not in page_content:
	        return None

	    # split(...)[0] is a no-op when the closing marker is absent
	    section = page_content.split("NUMERICAL DATA TABLES:")[1].split("RAINFALL DATA:")[0]
	    return [line.strip() for line in section.strip().split('\n') if line.strip()]

	def add_mock_rainfall_data(vector_store: VectorStoreService) -> None:
	    """
	    Add mock rainfall data to the vector store to ensure rainfall data is available

	    Three hand-written documents are added in a single ``add_documents``
	    call: North East India, North India and an all-India overview. Each one
	    has a "NUMERICAL DATA TABLES:" section followed by a "RAINFALL DATA:"
	    narrative section, and metadata with ``dataType`` set to "rainfall".

	    Table columns are separated by a plain ``|``. (A backslash before the
	    pipe is an invalid escape sequence in a normal string literal and would
	    put literal backslashes into the stored text.)

	    Args:
	        vector_store: Vector store service instance
	    """
	    print("Adding mock rainfall data to ensure availability...")

	    # One timestamp for the whole batch so the three documents agree
	    crawled_at = time.strftime("%Y-%m-%d %H:%M:%S")

	    # North East India rainfall data
	    ne_rainfall_doc = Document(
	        page_content="""
	MOSDAC North East India Rainfall Data

	NUMERICAL DATA TABLES:

	Table 1: Daily Rainfall Data for North East India (mm) - Latest Readings
	State | District | Rainfall (mm) | Date
	Assam | Guwahati | 45.2 | 2023-08-15
	Assam | Dibrugarh | 62.8 | 2023-08-15
	Assam | Jorhat | 38.6 | 2023-08-15
	Meghalaya | Shillong | 78.4 | 2023-08-15
	Meghalaya | Cherrapunji | 125.6 | 2023-08-15
	Arunachal Pradesh | Itanagar | 53.2 | 2023-08-15
	Manipur | Imphal | 35.8 | 2023-08-15
	Mizoram | Aizawl | 42.6 | 2023-08-15
	Nagaland | Kohima | 47.3 | 2023-08-15
	Tripura | Agartala | 51.9 | 2023-08-15

	Table 2: Average Annual Rainfall for North East Indian States (2020-2023)
	State | 2020 (mm) | 2021 (mm) | 2022 (mm) | 2023 (mm) | Average (mm)
	Assam | 2756 | 2890 | 2845 | 2780 | 2818
	Meghalaya | 11650 | 12100 | 11920 | 11820 | 11872
	Arunachal Pradesh | 2980 | 3120 | 3080 | 2960 | 3035
	Tripura | 2450 | 2580 | 2520 | 2450 | 2500
	Manipur | 1420 | 1510 | 1480 | 1460 | 1467
	Mizoram | 2580 | 2750 | 2690 | 2660 | 2670
	Nagaland | 1820 | 1950 | 1890 | 1865 | 1881

	RAINFALL DATA:
	The North East region receives heavy rainfall during the monsoon season (June to September).
	Cherrapunji and Mawsynram in Meghalaya are among the wettest places on Earth.

	Historical Records:
	- Cherrapunji's highest recorded daily rainfall: 1563.3 mm (June 16, 1995)
	- Mawsynram's average annual rainfall: 11,862 mm (world record)
	- Cherrapunji's average annual rainfall: 11,777 mm

	Monthly breakdown of rainfall in North East India for 2023:
	June: 385.2 mm
	July: 512.8 mm
	August: 485.6 mm
	September: 337.8 mm

	Key Facts:
	- Region receives 80% of annual rainfall during monsoon (June-September)
	- Pre-monsoon rainfall (March-May) accounts for 15-20% of annual total
	- Winter rainfall (December-February) is minimal, about 2-5% of annual total
	- Post-monsoon rainfall (October-November) contributes 8-10% of annual total
	""",
	        metadata={
	            "source": "https://www.mosdac.gov.in/rainfall-data-northeast",
	            "title": "MOSDAC - North East India Rainfall Data",
	            "crawled_at": crawled_at,
	            "keywords": "rainfall data, north east, india, precipitation, monsoon, historical data",
	            "dataType": "rainfall"
	        }
	    )

	    # North India rainfall data
	    north_rainfall_doc = Document(
	        page_content="""
	MOSDAC North India Rainfall Data

	NUMERICAL DATA TABLES:

	Table 1: Daily Rainfall Data for North India (mm)
	State | District | Rainfall (mm) | Date
	Uttar Pradesh | Lucknow | 28.5 | 2023-08-15
	Uttar Pradesh | Varanasi | 32.4 | 2023-08-15
	Delhi | New Delhi | 18.7 | 2023-08-15
	Haryana | Chandigarh | 25.2 | 2023-08-15
	Punjab | Amritsar | 21.8 | 2023-08-15
	Himachal Pradesh | Shimla | 35.6 | 2023-08-15
	Uttarakhand | Dehradun | 42.3 | 2023-08-15

	Table 2: Average Annual Rainfall for North Indian States (mm)
	State | Average Annual Rainfall (mm)
	Uttar Pradesh | 990
	Delhi | 820
	Haryana | 620
	Punjab | 649
	Himachal Pradesh | 1520
	Uttarakhand | 1605
	Jammu and Kashmir | 1180

	RAINFALL DATA:
	North India receives most of its rainfall during the monsoon season.
	Western Disturbances also bring rainfall during winter months.
	Average monsoon rainfall for North India in 2023: 825.7 mm (4% below normal).

	Monthly breakdown of rainfall in North India for 2023:
	June: 158.3 mm
	July: 284.5 mm
	August: 262.4 mm
	September: 120.5 mm
	""",
	        metadata={
	            "source": "https://www.mosdac.gov.in/rainfall-data-north",
	            "title": "MOSDAC - North India Rainfall Data",
	            "crawled_at": crawled_at,
	            "keywords": "rainfall data, north india, precipitation, monsoon",
	            "dataType": "rainfall"
	        }
	    )

	    # General rainfall description document with more detailed numerical data
	    general_rainfall_doc = Document(
	        page_content="""
	MOSDAC Rainfall Data Overview for India

	NUMERICAL DATA TABLES:

	Table 1: Monthly Rainfall Data for Different Regions of India (mm) - 2023
	Region | June | July | August | September | Total
	North India | 158.3 | 284.5 | 262.4 | 120.5 | 825.7
	North East India | 385.2 | 512.8 | 485.6 | 337.8 | 1721.4
	Central India | 172.8 | 325.6 | 295.4 | 156.3 | 950.1
	South Peninsula | 125.4 | 196.8 | 178.5 | 195.6 | 696.3
	Western India | 138.7 | 354.2 | 312.8 | 98.5 | 904.2

	Table 2: Highest Daily Rainfall Records (mm)
	Location | State | Rainfall (mm) | Date
	Mawsynram | Meghalaya | 1003.6 | 1985-06-16
	Cherrapunji | Meghalaya | 978.3 | 1995-06-12
	Mumbai | Maharashtra | 944.2 | 2005-07-26
	Dharampur | Gujarat | 823.6 | 2014-07-04
	Agumbe | Karnataka | 738.9 | 2019-08-11

	RAINFALL DATA:
	The Indian monsoon is vital for the country's agriculture and water resources.
	India receives about 80% of its annual rainfall during the monsoon season.
	Average annual rainfall across India: Approximately 1187 mm.

	Regional Variations in Rainfall:
	- North East India: Receives highest rainfall, with Mawsynram and Cherrapunji among the wettest places on Earth.
	- Western Ghats: High rainfall during monsoon, particularly in coastal Karnataka and Kerala.
	- Thar Desert: Lowest rainfall, often less than 300 mm annually.
	- Rajasthan: Average annual rainfall around 400-450 mm, highly variable.
	- Gangetic Plains: Moderate rainfall, approximately 800-1000 mm annually.
	""",
	        metadata={
	            "source": "https://www.mosdac.gov.in/rainfall-overview-india",
	            "title": "MOSDAC - India Rainfall Data Overview",
	            "crawled_at": crawled_at,
	            "keywords": "rainfall data, india, precipitation, monsoon, regional rainfall",
	            "dataType": "rainfall"
	        }
	    )

	    # Add the documents to the vector store
	    rainfall_docs = [ne_rainfall_doc, north_rainfall_doc, general_rainfall_doc]
	    vector_store.add_documents(rainfall_docs)
	    print(f"Added {len(rainfall_docs)} mock rainfall data documents to vector store")

	def initialize_vector_store(
	    crawl: bool = True,
	    max_pages: int = 100,
	    max_depth: int = 3,
	    documents_dir: Optional[str] = None,
	    index_path: str = "vector_index",
	    reset: bool = False,
	    add_mock_data: bool = False
	) -> None:
	    """
	    Initialize the vector store by crawling the MOSDAC website and/or
	    processing local documents

	    A new ``VectorStoreService`` is created for ``index_path`` and the
	    requested steps run in this order: reset, mock data, web crawl, local
	    documents. Steps that end up doing nothing (an empty crawl result, or a
	    ``documents_dir`` that is not a directory) are reported on stdout
	    instead of being skipped silently.

	    Args:
	        crawl: Whether to crawl the MOSDAC website
	        max_pages: Maximum number of pages to crawl
	        max_depth: Maximum depth to crawl
	        documents_dir: Directory containing local documents to process
	        index_path: Path to save the vector index
	        reset: Whether to reset the existing index
	        add_mock_data: Whether to add mock data to ensure critical data is available
	    """
	    # Initialize vector store service
	    vector_store = VectorStoreService(index_path=index_path)

	    # Reset if requested
	    if reset:
	        print("Resetting vector index...")
	        vector_store.reset_index()

	    # Add mock rainfall data if requested
	    if add_mock_data:
	        add_mock_rainfall_data(vector_store)

	    # Crawl the MOSDAC website if requested
	    if crawl:
	        print(f"Crawling MOSDAC website (max {max_pages} pages, depth {max_depth})...")
	        crawler = MOSDACCrawler()
	        documents = crawler.crawl(max_pages=max_pages, max_depth=max_depth)

	        if documents:
	            print(f"Adding {len(documents)} documents from web crawl to vector store...")
	            vector_store.add_documents(documents)
	        else:
	            # Make an empty crawl visible; otherwise the run looks successful
	            print("Web crawl returned no documents; nothing added from the crawl.")

	    # Process local documents if a directory is provided
	    if documents_dir:
	        if os.path.isdir(documents_dir):
	            process_local_documents(documents_dir, vector_store)
	        else:
	            # A mistyped path would otherwise be ignored without a trace
	            print(f"Skipping local documents: '{documents_dir}' is not a directory.")

	    print("Vector store initialization complete.")

	def process_local_documents(documents_dir: str, vector_store: VectorStoreService) -> None:
	    """
	    Walk a directory tree and index every supported document found in it

	    Supported files are matched on their lower-cased name: ``.pdf``,
	    ``.docx`` and ``.xlsx``/``.xls``. Anything else is skipped. A failure
	    while loading or adding one file is printed and the walk continues with
	    the next file.

	    Args:
	        documents_dir: Directory containing documents to process
	        vector_store: Vector store service instance
	    """
	    print(f"Processing local documents in {documents_dir}...")

	    # One row per supported format: (suffixes, loader, progress message)
	    handlers = (
	        (('.pdf',), PDFLoader(),
	         lambda count, name: f"Adding {count} chunks from PDF {name} to vector store..."),
	        (('.docx',), DOCXLoader(),
	         lambda count, name: f"Adding {count} chunks from DOCX {name} to vector store..."),
	        (('.xlsx', '.xls'), ExcelLoader(),
	         lambda count, name: f"Adding {count} documents from Excel {name} to vector store..."),
	    )

	    for root, _, files in os.walk(documents_dir):
	        for file in files:
	            file_path = os.path.join(root, file)
	            lowered = file.lower()

	            # Pick the first handler whose suffix matches; unsupported files are ignored
	            handler = next((entry for entry in handlers if lowered.endswith(entry[0])), None)
	            if handler is None:
	                continue
	            _, loader, announce = handler

	            try:
	                documents = loader.load_file(file_path)
	                if documents:
	                    print(announce(len(documents), file))
	                    vector_store.add_documents(documents)
	            except Exception as e:
	                print(f"Error processing {file_path}: {e}")

	def build_arg_parser() -> argparse.ArgumentParser:
	    """Build the command-line parser for the vector store initialization script."""
	    parser = argparse.ArgumentParser(description="Initialize the vector store for MOSDAC AI Assistant")

	    parser.add_argument("--crawl", action="store_true", help="Crawl the MOSDAC website")
	    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl")
	    parser.add_argument("--max-depth", type=int, default=3, help="Maximum depth to crawl")
	    parser.add_argument("--documents-dir", type=str, help="Directory containing local documents to process")
	    parser.add_argument("--index-path", type=str, default="vector_index", help="Path to save the vector index")
	    parser.add_argument("--reset", action="store_true", help="Reset the existing index")
	    parser.add_argument("--add-mock-data", action="store_true", help="Add mock data to ensure critical data is available")
	    parser.add_argument("--display", action="store_true", help="Display contents of the vector store")
	    parser.add_argument("--query", type=str, default="rainfall data", help="Query to search for when displaying contents")
	    parser.add_argument("--limit", type=int, default=5, help="Maximum number of documents to display")

	    return parser


	def main(argv: Optional[List[str]] = None) -> None:
	    """
	    Run the script: reset, seed, crawl/ingest and display as requested

	    Args:
	        argv: Argument list to parse; None makes argparse read sys.argv
	    """
	    args = build_arg_parser().parse_args(argv)

	    # Initialize vector store
	    vector_store = VectorStoreService(index_path=args.index_path)

	    # Process commands
	    if args.reset:
	        print("Resetting vector index...")
	        vector_store.reset_index()

	    if args.add_mock_data:
	        add_mock_rainfall_data(vector_store)

	    # Run the ingestion step when EITHER source is requested. Gating it on
	    # --crawl alone meant --documents-dir was ignored unless --crawl was
	    # also passed.
	    if args.crawl or args.documents_dir:
	        # NOTE(review): initialize_vector_store builds its own
	        # VectorStoreService for the same index_path; confirm that the
	        # instance used for --display below sees what it added.
	        initialize_vector_store(
	            crawl=args.crawl,
	            max_pages=args.max_pages,
	            max_depth=args.max_depth,
	            documents_dir=args.documents_dir,
	            index_path=args.index_path,
	            reset=False,  # We've already handled reset
	            add_mock_data=False  # We've already handled mock data
	        )

	    # Display contents if requested
	    if args.display:
	        display_vector_store_contents(vector_store, args.query, args.limit)


	if __name__ == "__main__":
	    main()