# File size: 4,745 Bytes
# 4093408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""

One-time setup script to load CSV data and create vector database

Run this only when you have new data or need to rebuild the database

"""

import os
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import pandas as pd

# Load environment variables from a .env file (if one exists) at import time.
# NOTE(review): nothing visible in this script reads the environment directly;
# presumably a dependency relies on it - confirm before removing.
load_dotenv()

# Output key -> (CSV column, fallback used when the column or the cell is missing).
_SCHEME_FIELDS = {
    "scheme_name": ("scheme_name", ""),
    "description": ("details", ""),
    "state": ("level", "National"),
    "eligibility": ("eligibility", ""),
    "benefits": ("benefits", ""),
    "category": ("schemeCategory", ""),
    "application_process": ("application", ""),
    "documents": ("documents", ""),
    "tags": ("tags", ""),
}


def _clean_cell(value, default=""):
    """Return *value* as stripped text, using *default* for missing cells."""
    # pandas represents empty CSV cells as NaN; str(NaN) would otherwise leak
    # the literal text "nan" into the scheme and, later, into the embeddings.
    if value is None or pd.isna(value):
        value = default
    return str(value).strip()


def fetch_government_schemes(csv_path='updated_data.csv'):
    """
    Load government schemes from a CSV file.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            ``updated_data.csv`` in the current working directory.

    Returns:
        A list of dicts, one per row that has a non-empty scheme name, with
        the keys ``scheme_name``, ``description``, ``state``, ``eligibility``,
        ``benefits``, ``category``, ``application_process``, ``documents``
        and ``tags``. Every value is a stripped string; missing columns or
        empty cells become ``""`` (``"National"`` for ``state``).
        An empty list is returned when the file is missing or unreadable.
    """
    print("Loading government schemes data from CSV...")

    # Only the read itself is guarded; failures are reported and turned into
    # an empty result so the caller can stop gracefully.
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"❌ Error: {csv_path} not found!")
        return []
    except Exception as e:
        print(f"❌ Error loading CSV: {e}")
        return []

    print(f"Loaded {len(df)} schemes from CSV")

    all_schemes = []
    for record in df.to_dict("records"):
        scheme = {
            key: _clean_cell(record.get(column, default), default)
            for key, (column, default) in _SCHEME_FIELDS.items()
        }

        # Rows without a scheme name carry no usable identity; skip them.
        if scheme["scheme_name"]:
            all_schemes.append(scheme)

    print(f"Successfully processed {len(all_schemes)} valid schemes")
    return all_schemes

# (label shown in the document, key in the scheme dict), in output order.
_DOCUMENT_FIELDS = (
    ("Scheme Name", "scheme_name"),
    ("State/Level", "state"),
    ("Category", "category"),
    ("Description", "description"),
    ("Eligibility Criteria", "eligibility"),
    ("Benefits", "benefits"),
    ("Application Process", "application_process"),
    ("Required Documents", "documents"),
    ("Tags", "tags"),
)


def prepare_documents(schemes):
    """
    Convert schemes data into text documents for RAG.

    Args:
        schemes: Iterable of scheme dicts. Each may contain the keys
            ``scheme_name``, ``state``, ``category``, ``description``,
            ``eligibility``, ``benefits``, ``application_process``,
            ``documents`` and ``tags``.

    Returns:
        A list with one text document per scheme. Each document lists every
        field as ``Label: value`` separated by blank lines and ends with a
        ``---`` separator. Fields that are missing, ``None`` or blank are
        rendered as ``N/A``.
    """
    print("Preparing documents...")

    def display(scheme, key):
        # A key that is present but empty must fall back to "N/A" too;
        # dict.get's default alone only covers absent keys.
        value = scheme.get(key)
        text = "" if value is None else str(value).strip()
        return text or "N/A"

    documents = []
    for scheme in schemes:
        body = "\n\n".join(
            f"{label}: {display(scheme, key)}" for label, key in _DOCUMENT_FIELDS
        )
        documents.append(f"\n\n{body}\n\n---\n\n")

    return documents

def build_vectorstore(
    documents,
    persist_directory="./chroma_db",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=1500,
    chunk_overlap=150,
):
    """
    Build vector database from documents.

    Args:
        documents: List of text documents to split, embed and store.
        persist_directory: Directory handed to Chroma as its persist location.
        model_name: Name of the HuggingFace embedding model to load.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The Chroma vector store created from the chunked documents.
    """
    print("\nInitializing embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'}
    )

    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    texts = text_splitter.create_documents(documents)

    print(f"Creating vector store with {len(texts)} chunks...")
    print("⏳ This may take a few minutes...")

    # NOTE(review): if persist_directory already holds a collection, this
    # appears to add to it rather than replace it, which would duplicate
    # chunks on a rebuild - confirm, and clear the directory first if so.
    vectorstore = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    print(f"✅ Vector store created and persisted to {persist_directory}")
    return vectorstore

def main():
    """
    Main setup function.

    Loads the schemes from CSV, converts them to text documents and builds
    the persisted vector store, printing progress and a final summary.
    Stops early, without building anything, when no schemes could be loaded.
    """
    banner = "=" * 80

    print(banner)
    print("🚀 Government Schemes RAG - Database Setup")
    print(banner)
    print()

    # Step 1: Load schemes from CSV
    schemes = fetch_government_schemes()

    if not schemes:
        print("\n❌ No schemes loaded. Please check your CSV file.")
        return

    # Step 2: Prepare documents
    documents = prepare_documents(schemes)

    # Step 3: Build and persist vector store (the returned store is not
    # needed here; persisting to disk is the goal of this script)
    build_vectorstore(documents)

    print("\n" + banner)
    print("✅ Setup Complete!")
    print(banner)
    print(f"📊 Total schemes: {len(schemes)}")
    print(f"📄 Total documents: {len(documents)}")
    print("💾 Vector DB saved to: ./chroma_db/")
    print()
    print("▶️  You can now run the API server with: python app.py")
    print(banner)

# Run the setup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()