Spaces:

suryateja008
/

SchemeSync

Sleeping

App Files Files Community

SchemeSync / setup_db.py

suryateja008

Upload 22 files

4093408 verified 2 months ago

raw

history blame contribute delete

4.75 kB

	"""
	One-time setup script to load CSV data and create vector database
	Run this only when you have new data or need to rebuild the database
	"""

	import os
	from dotenv import load_dotenv
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain_community.embeddings import HuggingFaceEmbeddings
	import pandas as pd

	# Load environment variables (python-dotenv's load_dotenv) before any setup runs.
	# NOTE(review): nothing visible in this script reads an environment variable
	# directly; presumably this is for libraries that consult the environment - confirm.
	load_dotenv()

	def fetch_government_schemes(csv_path='updated_data.csv'):
	    """
	    Load government schemes from a CSV file.

	    Each CSV row is mapped onto a scheme dictionary with the keys
	    scheme_name, description, state, eligibility, benefits, category,
	    application_process, documents and tags. Missing cells (NaN) and absent
	    columns fall back to an empty string, except ``state`` which falls back
	    to 'National'. Rows without a scheme name are skipped.

	    Args:
	        csv_path: Path of the CSV file to read. Defaults to
	            'updated_data.csv', the path this script originally hard-coded.

	    Returns:
	        A list of scheme dictionaries. An empty list is returned when the
	        file is missing or cannot be processed; the error is printed, not
	        raised.
	    """
	    print("Loading government schemes data from CSV...")

	    # Output key -> (CSV column, fallback used when the cell is missing).
	    field_map = {
	        "scheme_name": ("scheme_name", ''),
	        "description": ("details", ''),
	        "state": ("level", 'National'),
	        "eligibility": ("eligibility", ''),
	        "benefits": ("benefits", ''),
	        "category": ("schemeCategory", ''),
	        "application_process": ("application", ''),
	        "documents": ("documents", ''),
	        "tags": ("tags", ''),
	    }

	    try:
	        df = pd.read_csv(csv_path)
	        print(f"Loaded {len(df)} schemes from CSV")

	        all_schemes = []
	        for _, row in df.iterrows():
	            scheme = {
	                key: _clean_field(row, column, default)
	                for key, (column, default) in field_map.items()
	            }

	            # Only keep rows with a usable name. The 'nan' comparison is kept
	            # from the original filter for names that are literally "nan".
	            if scheme['scheme_name'] and scheme['scheme_name'] != 'nan':
	                all_schemes.append(scheme)

	        print(f"Successfully processed {len(all_schemes)} valid schemes")
	        return all_schemes

	    except FileNotFoundError:
	        print(f"❌ Error: {csv_path} not found!")
	        return []
	    except Exception as e:
	        # Script-level boundary: report the problem and let the caller see
	        # an empty result instead of a traceback.
	        print(f"❌ Error loading CSV: {e}")
	        return []


	def _clean_field(row, column, default=''):
	    """Return row[column] as stripped text, or ``default`` when absent or NaN."""
	    value = row.get(column, default)
	    # Without this check an empty cell would become the literal text 'nan'.
	    if pd.isna(value):
	        return default
	    return str(value).strip()

	def prepare_documents(schemes):
	    """
	    Render every scheme dictionary as one labelled text document for RAG.

	    Each document lists the scheme's name, state/level, category,
	    description, eligibility, benefits, application process, required
	    documents and tags, one per line, followed by a '---' separator. A key
	    that is absent from a scheme is rendered as 'N/A'.

	    Returns a list of strings in the same order as ``schemes``.
	    """
	    print("Preparing documents...")

	    def render(entry):
	        # One text block per scheme; every field falls back to 'N/A'.
	        return f"""
	Scheme Name: {entry.get('scheme_name', 'N/A')}
	State/Level: {entry.get('state', 'N/A')}
	Category: {entry.get('category', 'N/A')}
	Description: {entry.get('description', 'N/A')}
	Eligibility Criteria: {entry.get('eligibility', 'N/A')}
	Benefits: {entry.get('benefits', 'N/A')}
	Application Process: {entry.get('application_process', 'N/A')}
	Required Documents: {entry.get('documents', 'N/A')}
	Tags: {entry.get('tags', 'N/A')}
	---
	"""

	    return [render(entry) for entry in schemes]

	def build_vectorstore(documents):
	    """
	    Chunk the text documents, embed them and store them in Chroma.

	    The documents are split with RecursiveCharacterTextSplitter (1500
	    characters per chunk, 150 overlap, measured with ``len``), embedded with
	    the 'sentence-transformers/all-MiniLM-L6-v2' model on the CPU, and handed
	    to ``Chroma.from_documents`` with ``persist_directory="./chroma_db"``.

	    NOTE(review): presumably re-running this against an existing ./chroma_db
	    adds to what is already stored rather than replacing it - confirm.

	    Returns the Chroma vector store object.
	    """
	    print("\nInitializing embedding model...")
	    embedder = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2",
	        model_kwargs={'device': 'cpu'},
	    )

	    print("Splitting documents into chunks...")
	    splitter_options = {
	        "chunk_size": 1500,
	        "chunk_overlap": 150,
	        "length_function": len,
	    }
	    chunker = RecursiveCharacterTextSplitter(**splitter_options)
	    chunks = chunker.create_documents(documents)

	    print(f"Creating vector store with {len(chunks)} chunks...")
	    print("⏳ This may take a few minutes...")

	    store = Chroma.from_documents(
	        documents=chunks,
	        embedding=embedder,
	        persist_directory="./chroma_db",
	    )

	    print("✅ Vector store created and persisted to ./chroma_db/")
	    return store

	def main():
	    """
	    Run the one-time setup end to end.

	    Loads the schemes from the CSV, turns them into text documents, builds
	    the vector store from them and prints a summary. Stops early with a
	    message when no schemes could be loaded.
	    """
	    rule = "=" * 80

	    print(rule)
	    print("🚀 Government Schemes RAG - Database Setup")
	    print(rule)
	    print()

	    # Step 1: load schemes from CSV; nothing to build without them.
	    schemes = fetch_government_schemes()
	    if not schemes:
	        print("\n❌ No schemes loaded. Please check your CSV file.")
	        return

	    # Step 2: prepare documents.
	    documents = prepare_documents(schemes)

	    # Step 3: build and persist the vector store (the returned store is not
	    # needed here).
	    build_vectorstore(documents)

	    summary_lines = (
	        "\n" + rule,
	        "✅ Setup Complete!",
	        rule,
	        f"📊 Total schemes: {len(schemes)}",
	        f"📄 Total documents: {len(documents)}",
	        "💾 Vector DB saved to: ./chroma_db/",
	        "",
	        "▶️ You can now run the API server with: python app.py",
	        rule,
	    )
	    for line in summary_lines:
	        print(line)

	# Run the setup only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()