# Spaces: Sleeping
"""
One-time setup script to load CSV data and create the vector database.

Run this only when you have new data or need to rebuild the database.
"""
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import pandas as pd

# Load environment variables from a local .env file before anything else runs.
load_dotenv()
def _clean_cell(value, default=""):
    """Return a CSV cell as stripped text, or *default* when the cell is blank."""
    # pandas represents an empty cell as NaN; str(NaN) would leak the literal
    # text "nan" into the scheme (and from there into the embedded documents).
    if value is None or pd.isna(value):
        return default
    text = str(value).strip()
    return text or default


def fetch_government_schemes(csv_path="updated_data.csv"):
    """
    Load government schemes from a CSV file.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            ``updated_data.csv`` in the current working directory.

    Returns:
        A list of dicts, one per row that has a non-blank ``scheme_name``.
        Each dict has the keys ``scheme_name``, ``description``, ``state``,
        ``eligibility``, ``benefits``, ``category``, ``application_process``,
        ``documents`` and ``tags``; every value is a stripped string. Blank
        or missing cells become ``""``, except ``state`` which falls back to
        ``"National"``. Returns ``[]`` when the file is missing or cannot be
        read (the error is printed, not raised).
    """
    print("Loading government schemes data from CSV...")

    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"❌ Error: {csv_path} not found!")
        return []
    except Exception as e:
        # Best-effort loader: report any read/parse failure and let the
        # caller decide what to do with an empty result.
        print(f"❌ Error loading CSV: {e}")
        return []

    print(f"Loaded {len(df)} schemes from CSV")

    # (key in the scheme dict, column in the CSV, fallback for blank cells)
    field_map = (
        ("scheme_name", "scheme_name", ""),
        ("description", "details", ""),
        ("state", "level", "National"),
        ("eligibility", "eligibility", ""),
        ("benefits", "benefits", ""),
        ("category", "schemeCategory", ""),
        ("application_process", "application", ""),
        ("documents", "documents", ""),
        ("tags", "tags", ""),
    )

    all_schemes = []
    for row in df.to_dict("records"):
        scheme = {
            key: _clean_cell(row.get(column), fallback)
            for key, column, fallback in field_map
        }
        # Only keep rows that actually name a scheme.
        if scheme["scheme_name"]:
            all_schemes.append(scheme)

    print(f"Successfully processed {len(all_schemes)} valid schemes")
    return all_schemes
def prepare_documents(schemes):
    """
    Convert scheme dicts into text documents for RAG.

    Args:
        schemes: Iterable of dicts using the keys ``scheme_name``, ``state``,
            ``category``, ``description``, ``eligibility``, ``benefits``,
            ``application_process``, ``documents`` and ``tags``. Any key may
            be absent.

    Returns:
        A list of strings, one per scheme, in input order. Each string holds
        one ``Label: value`` line per field followed by a ``---`` separator
        line. Missing, ``None`` or blank values are rendered as ``N/A``.
    """
    print("Preparing documents...")

    # (label shown in the document, key in the scheme dict), in output order.
    labelled_fields = (
        ("Scheme Name", "scheme_name"),
        ("State/Level", "state"),
        ("Category", "category"),
        ("Description", "description"),
        ("Eligibility Criteria", "eligibility"),
        ("Benefits", "benefits"),
        ("Application Process", "application_process"),
        ("Required Documents", "documents"),
        ("Tags", "tags"),
    )

    def field_text(scheme, key):
        # A plain .get(key, 'N/A') only covers absent keys; blank values
        # would otherwise produce an empty "Label: " line.
        value = scheme.get(key)
        text = "" if value is None else str(value).strip()
        return text or "N/A"

    documents = []
    for scheme in schemes:
        body = "\n".join(
            f"{label}: {field_text(scheme, key)}"
            for label, key in labelled_fields
        )
        # Built from explicit lines so the text never depends on how a
        # multi-line literal happens to be indented in this file.
        documents.append(f"\n{body}\n---\n")
    return documents
def build_vectorstore(
    documents,
    persist_directory="./chroma_db",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=1500,
    chunk_overlap=150,
):
    """
    Build the vector database from text documents.

    Args:
        documents: List of document strings to split, embed and store.
        persist_directory: Directory passed to Chroma as its persistence
            location. Defaults to ``./chroma_db``.
        model_name: Name of the embedding model handed to
            ``HuggingFaceEmbeddings``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    print("\nInitializing embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'},
    )

    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    texts = text_splitter.create_documents(documents)

    print(f"Creating vector store with {len(texts)} chunks...")
    print("⏳ This may take a few minutes...")
    # NOTE(review): nothing here clears an existing persist_directory, so a
    # re-run presumably adds the new chunks alongside the old ones - confirm
    # against the Chroma docs and remove the directory before rebuilding if so.
    vectorstore = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

    print(f"✅ Vector store created and persisted to {persist_directory.rstrip('/')}/")
    return vectorstore
def main():
    """
    Run the full setup: load the CSV, build documents, persist the vector DB.

    Prints progress and a summary to stdout. Returns ``None``; when no schemes
    could be loaded it prints a hint and returns early without building
    anything.
    """
    banner = "=" * 80

    print(banner)
    print("🚀 Government Schemes RAG - Database Setup")
    print(banner)
    print()

    # Step 1: Load schemes from CSV
    schemes = fetch_government_schemes()
    if not schemes:
        print("\n❌ No schemes loaded. Please check your CSV file.")
        return

    # Step 2: Prepare documents
    documents = prepare_documents(schemes)

    # Step 3: Build and persist vector store (the returned store is not
    # needed here; persisting to disk is the point of this script).
    build_vectorstore(documents)

    print("\n" + banner)
    print("✅ Setup Complete!")
    print(banner)
    print(f"📊 Total schemes: {len(schemes)}")
    print(f"📄 Total documents: {len(documents)}")
    print("💾 Vector DB saved to: ./chroma_db/")
    print()
    print("▶️ You can now run the API server with: python app.py")
    print(banner)


if __name__ == "__main__":
    main()