hardkpentium101 commited on
Commit
2e82da7
·
1 Parent(s): f0ecb68

merge local branch

Browse files
.env.example ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qdrant Cloud Configuration (required)
2
+ QDRANT_URL=https://your-cluster.qdrant.tech
3
+ QDRANT_API_KEY=your-api-key
4
+
5
+ # HuggingFace Token (optional but recommended for faster downloads)
6
+ HF_TOKEN=your-huggingface-token
7
+
8
+ # LLM Provider (default: huggingface)
9
+ LLM_PROVIDER=huggingface
10
+
11
+ # HuggingFace Model (default: Sarvam-1 for Indic languages)
12
+ HUGGINGFACE_MODEL=sarvamai/sarvam-1
13
+
14
+ # Generation Parameters
15
+ TEMPERATURE=0.7
16
+ MAX_NEW_TOKENS=1024
17
+ REPETITION_PENALTY=1.1
18
+ TOP_P=0.9
19
+ TOP_K=50
20
+
21
+ # Backend URL for frontend
22
+ BACKEND_URL=http://localhost:8000
.gitignore ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Environment
27
+ .env
28
+ .env.local
29
+
30
+ # IDE
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+
36
+ # OS
37
+ .DS_Store
38
+ Thumbs.db
39
+
40
+ # Qdrant
41
+ qdrant_storage/
42
+
43
+ # Logs
44
+ *.log
45
+
46
+ # Model cache
47
+ .cache/
48
+ models/
49
+
50
+ # Dataset hashes
51
+ hf_datasets_hashes.json
README.md CHANGED
@@ -1,10 +1,245 @@
1
  ---
2
- title: IndicRAG
3
- emoji: 🏃
4
- colorFrom: blue
5
- colorTo: pink
6
  sdk: docker
 
 
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: HindiRAG
3
+ emoji: 💻
4
+ colorFrom: pink
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 7860
8
+ sdk_version: latest
9
+ app_file: Dockerfile
10
  pinned: false
11
  ---
12
 
13
+
14
+ # Simple RAG
15
+
16
+ A simplified Retrieval-Augmented Generation (RAG) system for Indic languages using Sarvam-1 model and Qdrant Cloud.
17
+
18
+ ## Features
19
+
20
+ - **10 Indic Languages Support**: Hindi, Bengali, Gujarati, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu
21
+ - **User-Selected Language**: Choose your preferred language from the UI
22
+ - **Sarvam-1 Model**: Optimized for Indic language generation
23
+ - **Qdrant Cloud**: Managed vector database service
24
+ - **Simple UI**: Clean interface with just query input and response area
25
+ - **No Language Detection**: Direct query processing without auto-detection
26
+ - **HuggingFace Datasets**: Load data directly from HuggingFace (no local files needed)
27
+
28
+ ## Supported Languages
29
+
30
+ | Language | Native Name | Code |
31
+ |----------|-------------|------|
32
+ | Hindi | हिंदी | hi |
33
+ | Bengali | বাংলা | bn |
34
+ | Gujarati | ગુજરાતી | gu |
35
+ | Kannada | ಕನ್ನಡ | kn |
36
+ | Malayalam | മലയാളം | ml |
37
+ | Marathi | मराठी | mr |
38
+ | Odia | ଓଡ଼ିଆ | or |
39
+ | Punjabi | ਪੰਜਾਬੀ | pa |
40
+ | Tamil | தமிழ் | ta |
41
+ | Telugu | తెలుగు | te |
42
+
43
+ ## Quick Start
44
+
45
+ ### 1. Clone and Setup
46
+
47
+ ```bash
48
+ cd simple-rag
49
+ ```
50
+
51
+ ### 2. Configure Environment
52
+
53
+ ```bash
54
+ cp .env.example .env
55
+ ```
56
+
57
+ Edit `.env` and add your Qdrant Cloud credentials:
58
+ ```bash
59
+ QDRANT_URL=https://your-cluster.qdrant.tech
60
+ QDRANT_API_KEY=your-api-key
61
+ ```
62
+
63
+ ### 3. Install Dependencies
64
+
65
+ ```bash
66
+ # Backend
67
+ cd backend
68
+ pip install -r requirements.txt
69
+
70
+ # Frontend (in a new terminal)
71
+ cd ../frontend
72
+ pip install -r requirements.txt
73
+ ```
74
+
75
+ ### 4. Load Data to Qdrant (Optional)
76
+
77
+ If your Qdrant collection is empty, you can load data from HuggingFace datasets:
78
+
79
+ ```bash
80
+ cd backend
81
+
82
+ # Set the datasets you want to load (MIRACL corpus for all 10 Indic languages)
83
+ export HF_DATASETS=miracl/miracl-corpus:hi:train,miracl/miracl-corpus:bn:train,miracl/miracl-corpus:ta:train,miracl/miracl-corpus:te:train,miracl/miracl-corpus:mr:train,miracl/miracl-corpus:gu:train,miracl/miracl-corpus:kn:train,miracl/miracl-corpus:ml:train,miracl/miracl-corpus:or:train,miracl/miracl-corpus:pa:train
84
+
85
+ # Run the data loader
86
+ python src/load_huggingface_dataset.py
87
+ ```
88
+
89
+ ### 5. Run the Application
90
+
91
+ ```bash
92
+ # Terminal 1: Start Backend
93
+ cd backend
94
+ python main.py
95
+
96
+ # Terminal 2: Start Frontend
97
+ cd frontend
98
+ streamlit run app.py
99
+ ```
100
+
101
+ Then open http://localhost:8501 in your browser.
102
+
103
+ ## Architecture
104
+
105
+ ```
106
+ ┌─────────────────┐ ┌──────────────┐ ┌─────────────────┐
107
+ │ User Query │────▶│ Qdrant Cloud│────▶│ Sarvam-1 LLM │
108
+ │ (Selected Lang)│ │ Vector DB │ │ (Generation) │
109
+ └─────────────────┘ └──────────────┘ └─────────────────┘
110
+ │ │
111
+ │ │
112
+ ▼ ▼
113
+ ┌─────────────────┐ ┌─────────────────┐
114
+ │ Streamlit UI │ │ Embedding │
115
+ │ (Frontend) │ │ Generator │
116
+ └─────────────────┘ └─────────────────┘
117
+ ```
118
+
119
+ ## Project Structure
120
+
121
+ ```
122
+ simple-rag/
123
+ ├── backend/
124
+ │ ├── src/
125
+ │ │ ├── rag_system.py # Simplified RAG system
126
+ │ │ ├── llm_manager.py # Sarvam-1 LLM management
127
+ │ │ ├── embedding_generator.py # Embedding generation
128
+ │ │ ├── qdrant_setup.py # Qdrant Cloud setup
129
+ │ │ ├── language_constants.py # Supported languages config
130
+ │ │ └── load_huggingface_dataset.py # HF dataset loader
131
+ │ ├── main.py # FastAPI application
132
+ │ ├── requirements.txt
133
+ │ └── Dockerfile
134
+ ├── frontend/
135
+ │ ├── app.py # Streamlit frontend
136
+ │ ├── requirements.txt
137
+ │ └── Dockerfile
138
+ ├── docker-compose.yml
139
+ ├── .env.example
140
+ └── README.md
141
+ ```
142
+
143
+ ## API Endpoints
144
+
145
+ ### POST /query
146
+ Query the RAG system.
147
+
148
+ ```json
149
+ {
150
+ "query": "प्रकृति का वर्णन कैसे किया गया है?",
151
+ "top_k": 5,
152
+ "language": "hi"
153
+ }
154
+ ```
155
+
156
+ Response:
157
+ ```json
158
+ {
159
+ "question": "प्रकृति का वर्णन कैसे किया गया है?",
160
+ "answer": "...",
161
+ "user_selected_language": "hi",
162
+ "supported_languages": [...]
163
+ }
164
+ ```
165
+
166
+ ### GET /languages
167
+ Get list of supported languages.
168
+
169
+ ### GET /health
170
+ Health check endpoint.
171
+
172
+ ## Docker Deployment
173
+
174
+ ```bash
175
+ # Set environment variables
176
+ export QDRANT_URL=https://your-cluster.qdrant.tech
177
+ export QDRANT_API_KEY=your-api-key
178
+
179
+ # Build and run all services
180
+ docker-compose up --build
181
+ ```
182
+
183
+ ## Differences from HindiRAG
184
+
185
+ - **No Language Detection**: User selects language manually
186
+ - **Simplified UI**: No sidebar, no document display, no document count
187
+ - **Direct Processing**: Query goes directly to Qdrant without language analysis
188
+ - **LLM Language Handling**: The LLM prompt instructs it to detect and respond in the query language
189
+ - **Separate Backend/Frontend**: FastAPI backend, Streamlit frontend
190
+ - **Qdrant Cloud**: Uses managed Qdrant Cloud instead of local instance
191
+ - **No Local Data**: Uses HuggingFace datasets only, no local file dependencies
192
+
193
+ ## Troubleshooting
194
+
195
+ ### LLM Initialization Failed
196
+ - Ensure you have enough memory for Sarvam-1 model (~8GB)
197
+ - Check internet connection for model download
198
+
199
+ ### Qdrant Connection Error
200
+ - Verify your QDRANT_URL and QDRANT_API_KEY in `.env`
201
+ - Check that your Qdrant Cloud cluster is running
202
+ - Ensure network access to Qdrant Cloud
203
+
204
+ ### Backend Connection Error
205
+ - Ensure backend is running on port 8000
206
+ - Check `BACKEND_URL` in frontend configuration
207
+
208
+ ### No Documents Found
209
+ - Make sure your Qdrant collection has documents loaded
210
+ - Run the HuggingFace dataset loader to populate Qdrant
211
+
212
+ ## Setting up Qdrant Cloud
213
+
214
+ 1. Go to https://cloud.qdrant.io/
215
+ 2. Create a free account
216
+ 3. Create a new cluster
217
+ 4. Copy the cluster URL and API key
218
+ 5. Add them to your `.env` file
219
+
220
+ ## Loading HuggingFace Datasets
221
+
222
+ The system uses the MIRACL corpus for Indic languages. To load data:
223
+
224
+ ```bash
225
+ cd backend
226
+
227
+ # Load all 10 Indic languages (506K+ documents)
228
+ export HF_DATASETS=miracl/miracl-corpus:hi:train,miracl/miracl-corpus:bn:train,miracl/miracl-corpus:ta:train,miracl/miracl-corpus:te:train,miracl/miracl-corpus:mr:train,miracl/miracl-corpus:gu:train,miracl/miracl-corpus:kn:train,miracl/miracl-corpus:ml:train,miracl/miracl-corpus:or:train,miracl/miracl-corpus:pa:train
229
+
230
+ # Or load a single language (e.g., Hindi only - 506K docs)
231
+ export HF_DATASETS=miracl/miracl-corpus:hi:train
232
+
233
+ # Run the loader
234
+ python src/load_huggingface_dataset.py
235
+ ```
236
+
237
+ ## License
238
+
239
+ MIT License
240
+
241
+ ## Acknowledgments
242
+
243
+ - [Sarvam-1](https://sarvam.ai) for the Indic language model
244
+ - [Qdrant](https://qdrant.tech) for vector database
245
+ - [HuggingFace](https://huggingface.co) for MIRACL corpus dataset
backend/Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Backend image for the Simple RAG FastAPI service (entry point: main.py).
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies (compiler toolchain for packages with native extensions)
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user (uid 1000)
RUN useradd -m -u 1000 appuser

# Copy requirements first so dependency layers are cached across code changes
COPY requirements.txt .

# Install Python dependencies (CPU-only torch wheels via the extra index)
RUN pip install --no-cache-dir -r requirements.txt \
    --extra-index-url https://download.pytorch.org/whl/cpu

# Copy application code
COPY . .

# Change ownership to non-root user
RUN chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Expose port (matches uvicorn.run(..., port=8000) in main.py)
EXPOSE 8000

# Run the application
CMD ["python", "main.py"]
backend/main.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Backend for Simple RAG System
3
+ """
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from fastapi import FastAPI, HTTPException
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from pydantic import BaseModel
11
+ from typing import Optional, List, Dict, Any
12
+
13
+ # Add src directory to path
14
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
15
+
16
+ from rag_system import SimpleRAGSystem
17
+ from language_constants import get_supported_languages
18
+
19
# FastAPI application instance; the metadata appears in the auto-generated
# OpenAPI docs (/docs, /redoc).
app = FastAPI(
    title="Simple RAG API",
    description="A simplified RAG system API for Indic languages",
    version="1.0.0"
)

# Allow cross-origin requests from any host so the separately served
# Streamlit frontend can call this API.
# NOTE(review): "*" origins combined with allow_credentials=True is very
# permissive — tighten for production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
34
+
35
+
36
class QueryRequest(BaseModel):
    """Request body for POST /query."""
    query: str  # the user's question
    top_k: int = 5  # number of documents to retrieve from Qdrant
    language: Optional[str] = None  # optional user-selected language code (e.g. "hi")
40
+
41
+
42
class QueryResponse(BaseModel):
    """Response body for POST /query."""
    question: str  # echo of the original query
    answer: str  # generated answer text
    user_selected_language: Optional[str]  # language code from the request, if any
    supported_languages: List[Dict[str, str]]  # entries shaped like language_constants.get_supported_languages()
47
+
48
+
49
# Global RAG system instance, created lazily on the first request rather
# than at import time.
rag_system: Optional[SimpleRAGSystem] = None


def get_rag_system() -> SimpleRAGSystem:
    """Get or initialize the process-wide RAG system (lazy singleton)."""
    global rag_system
    if rag_system is None:
        rag_system = SimpleRAGSystem()
    return rag_system
59
+
60
+
61
+ @app.get("/")
62
+ async def root():
63
+ """Root endpoint"""
64
+ return {"message": "Simple RAG API is running", "status": "healthy"}
65
+
66
+
67
+ @app.get("/health")
68
+ async def health_check():
69
+ """Health check endpoint"""
70
+ return {"status": "healthy"}
71
+
72
+
73
+ @app.get("/languages")
74
+ async def get_languages():
75
+ """Get list of supported languages"""
76
+ return {"languages": get_supported_languages()}
77
+
78
+
79
+ @app.post("/query", response_model=QueryResponse)
80
+ async def query(request: QueryRequest):
81
+ """
82
+ Query the RAG system
83
+
84
+ - query: The user's question
85
+ - top_k: Number of documents to retrieve (default: 5)
86
+ - language: Optional language code selected by user
87
+ """
88
+ try:
89
+ system = get_rag_system()
90
+ result = system.query(
91
+ question=request.query,
92
+ top_k=request.top_k,
93
+ user_selected_language=request.language
94
+ )
95
+ return QueryResponse(
96
+ question=result["question"],
97
+ answer=result["answer"],
98
+ user_selected_language=result["user_selected_language"],
99
+ supported_languages=result["supported_languages"]
100
+ )
101
+ except Exception as e:
102
+ raise HTTPException(status_code=500, detail=str(e))
103
+
104
+
105
+ if __name__ == "__main__":
106
+ import uvicorn
107
+ uvicorn.run(app, host="0.0.0.0", port=8000)
backend/requirements.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ qdrant-client>=1.7.0
3
+ fastapi>=0.104.0
4
+ uvicorn[standard]>=0.24.0
5
+ pydantic>=2.4.2
6
+ python-dotenv>=1.0.0
7
+
8
+ # ML/NLP dependencies
9
+ langchain>=0.0.331
10
+ langchain-huggingface>=0.0.1
11
+ sentence-transformers>=2.2.2
12
+ torch>=2.0.1
13
+ transformers>=4.35.0
14
+ tokenizers>=0.14.0
15
+ datasets>=2.14.6,<4.0.0
16
+ numpy>=1.24.3
17
+ pandas>=2.1.1
18
+ huggingface-hub>=0.17.3
19
+ accelerate>=0.23.0
20
+ sentencepiece>=0.1.99
21
+
22
+ # Utilities
23
+ tiktoken>=0.5.1
24
+
25
+ # Frontend
26
+ streamlit>=1.28.1
27
+ httpx>=0.25.0
backend/src/embedding_generator.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ import numpy as np
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from dotenv import load_dotenv
6
+ import torch
7
+
8
+ load_dotenv()
9
+
10
+
11
class EmbeddingGenerator:
    """Dense-vector embedder for multilingual text.

    Wraps a HuggingFace encoder and mean-pools the last hidden state to
    produce one vector per input text. Runs on CPU in float32.
    """

    def __init__(self, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """Initialize embedding generator for multilingual text"""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        encoder = AutoModel.from_pretrained(
            model_name,
            dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=False
        )
        # Pin the model to CPU and switch to inference mode (disables dropout).
        self.model = encoder.to("cpu")
        self.model.eval()

    def get_embedding(self, text: str) -> List[float]:
        """Generate the mean-pooled embedding of `text` as a list of floats."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
            add_special_tokens=True
        )

        with torch.no_grad():
            output = self.model(**encoded)

        # Average token states over the sequence axis, then drop the batch axis.
        pooled = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
        return pooled.tolist()

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed each text independently, preserving input order."""
        return [self.get_embedding(t) for t in texts]
45
+
46
+
47
def get_embedding_function():
    """Return a ready-to-use embedding callable (a bound ``get_embedding``)."""
    return EmbeddingGenerator().get_embedding
51
+
52
+
53
+ if __name__ == "__main__":
54
+ embed_gen = EmbeddingGenerator()
55
+ test_text = "Hello world, नमस्ते दुनिया"
56
+ embedding = embed_gen.get_embedding(test_text)
57
+ print(f"Embedding length: {len(embedding)}")
58
+ print(f"First 10 values: {embedding[:10]}")
backend/src/language_constants.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Supported languages configuration for the RAG system
3
+ No language detection - just defines supported languages
4
+ """
5
+
6
+ SUPPORTED_LANGUAGES = {
7
+ "hi": {"name": "Hindi", "native_name": "हिंदी"},
8
+ "bn": {"name": "Bengali", "native_name": "বাংলা"},
9
+ "gu": {"name": "Gujarati", "native_name": "ગુજરાતી"},
10
+ "kn": {"name": "Kannada", "native_name": "ಕನ್ನಡ"},
11
+ "ml": {"name": "Malayalam", "native_name": "മലയാളം"},
12
+ "mr": {"name": "Marathi", "native_name": "मराठी"},
13
+ "or": {"name": "Odia", "native_name": "ଓଡ଼ିଆ"},
14
+ "pa": {"name": "Punjabi", "native_name": "ਪੰਜਾਬੀ"},
15
+ "ta": {"name": "Tamil", "native_name": "தமிழ்"},
16
+ "te": {"name": "Telugu", "native_name": "తెలుగు"},
17
+ }
18
+
19
+
20
+ def get_supported_languages():
21
+ """Get list of supported languages"""
22
+ return [
23
+ {"code": code, "name": info["name"], "native_name": info["native_name"]}
24
+ for code, info in SUPPORTED_LANGUAGES.items()
25
+ ]
26
+
27
+
28
+ def is_language_supported(lang_code: str) -> bool:
29
+ """Check if a language code is supported"""
30
+ return lang_code in SUPPORTED_LANGUAGES
31
+
32
+
33
+ def get_language_info(lang_code: str) -> dict:
34
+ """Get language information by code"""
35
+ if lang_code in SUPPORTED_LANGUAGES:
36
+ return {
37
+ "code": lang_code,
38
+ "name": SUPPORTED_LANGUAGES[lang_code]["name"],
39
+ "native_name": SUPPORTED_LANGUAGES[lang_code]["native_name"],
40
+ }
41
+ return {"code": lang_code, "name": lang_code, "native_name": lang_code}
backend/src/llm_manager.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Manager module with Sarvam-1 model support for Indic languages
3
+ Optimized for CPU-only environments
4
+ """
5
+ from typing import Optional, Dict, Any
6
+ from langchain_huggingface import HuggingFacePipeline
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
8
+ from huggingface_hub import login
9
+ import torch
10
+ import os
11
+ import warnings
12
+ import logging
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+ warnings.filterwarnings("ignore", category=UserWarning)
17
+
18
+
19
class LLMManager:
    """Process-wide singleton that lazily builds and caches one LLM pipeline.

    Class attributes:
      _instance              -- the singleton LLMManager object
      _llm_instance          -- cached HuggingFacePipeline, built on first use
      _initialization_error  -- message from a failed build; once set, later
                               get_llm() calls short-circuit and return None
    """

    _instance = None
    _llm_instance = None
    _initialization_error = None

    def __new__(cls):
        # Classic singleton: every LLMManager() call returns the same object.
        if cls._instance is None:
            cls._instance = super(LLMManager, cls).__new__(cls)
        return cls._instance

    def get_llm(self, provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
        """Get LLM instance based on provider.

        Returns the cached pipeline when one exists, otherwise tries to
        build it. Returns None (never raises) when initialization failed.
        """
        if self._initialization_error is not None:
            logger.error(f"LLM initialization failed: {self._initialization_error}")
            return None

        if self._llm_instance is not None:
            return self._llm_instance

        self._llm_instance = self._get_sarvam_llm(model_kwargs)

        if self._llm_instance is None:
            logger.error("Failed to initialize Sarvam-1 LLM")
            self._initialization_error = "Sarvam-1 initialization failed"

        return self._llm_instance

    def _get_sarvam_llm(self, model_kwargs: Optional[Dict[str, Any]] = None):
        """Initialize the Indic text-generation model.

        Fixes over the previous version:
        - the model id now honors the HUGGINGFACE_MODEL env var (documented
          in .env.example) instead of being hard-coded;
        - caller-supplied ``model_kwargs`` are merged over the CPU-friendly
          defaults instead of being silently ignored.
        """
        model_id = os.getenv("HUGGINGFACE_MODEL", "sarvamai/sarvam-1")

        try:
            # Authenticate with HuggingFace if token is provided
            hf_token = os.getenv("HF_TOKEN")
            if hf_token:
                logger.info("Authenticating with HuggingFace...")
                login(token=hf_token)
            else:
                logger.warning("No HF_TOKEN provided. Downloads may be slower.")

            logger.info(f"Initializing Sarvam-1 model: {model_id}")
            logger.info("Sarvam-1: 2B parameters, optimized for 10 Indic languages")
            logger.info("Loading model with CPU-first approach...")

            # CPU-safe defaults; explicit caller kwargs take precedence.
            merged_kwargs = {
                "torch_dtype": torch.float32,
                "low_cpu_mem_usage": False,
            }
            if model_kwargs:
                merged_kwargs.update(model_kwargs)

            pipe = pipeline(
                "text-generation",
                model=model_id,
                model_kwargs=merged_kwargs,
                device_map="cpu"
            )

            logger.info("Sarvam-1 pipeline initialized successfully on CPU")
            llm = HuggingFacePipeline(pipeline=pipe)
            return llm

        except Exception as e:
            logger.error(f"Failed to initialize Sarvam-1 model: {e}")
            if "meta tensor" in str(e).lower():
                logger.error("Meta tensor error: Insufficient RAM for model loading")
            self._initialization_error = str(e)
            return None

    def is_available(self) -> bool:
        """True once a pipeline has been built and no error was recorded."""
        return self._llm_instance is not None and self._initialization_error is None

    def get_initialization_error(self) -> Optional[str]:
        """Return the recorded initialization error message, if any."""
        return self._initialization_error
89
+
90
+
91
def get_llm(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """Module-level shortcut: fetch the LLM from the singleton manager."""
    return LLMManager().get_llm(provider, model_kwargs)
95
+
96
+
97
def get_llm_with_provider(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """Get LLM with specific provider and model kwargs (same behavior as get_llm)."""
    return LLMManager().get_llm(provider, model_kwargs)
backend/src/load_huggingface_dataset.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Load HuggingFace datasets and ingest into Qdrant
3
+ No local file dependencies - uses only HF datasets
4
+ """
5
+ import os
6
+ import sys
7
+ import hashlib
8
+ import json
9
+ from pathlib import Path
10
+ from typing import List, Dict
11
+
12
+ # Add src directory to path
13
+ sys.path.insert(0, str(Path(__file__).parent))
14
+
15
+ from qdrant_setup import QdrantSetup
16
+ from embedding_generator import EmbeddingGenerator
17
+ from datasets import load_dataset
18
+
19
# JSON file recording which dataset specs have already been ingested.
HASH_FILE = "hf_datasets_hashes.json"


def get_dataset_hashes() -> Dict[str, str]:
    """Read the ingestion registry from HASH_FILE; empty dict when absent."""
    if not os.path.exists(HASH_FILE):
        return {}
    with open(HASH_FILE, "r") as f:
        return json.load(f)


def save_dataset_hashes(hashes: Dict[str, str]):
    """Persist the ingestion registry to HASH_FILE as pretty-printed JSON."""
    with open(HASH_FILE, "w") as f:
        json.dump(hashes, f, indent=2)
35
+
36
+
37
def compute_dataset_hash(dataset_name: str, config: str, split: str, data) -> str:
    """Fingerprint a dataset by its identity and size, to detect changes."""
    # Hash the coordinates plus the element count, not the contents.
    signature = f"{dataset_name}:{config}:{split}:{len(data)}"
    return hashlib.md5(signature.encode()).hexdigest()
42
+
43
+
44
def parse_dataset_spec(spec: str) -> tuple:
    """Parse a dataset specification string.

    Accepted forms:
      "name:config:split" -> (name, config, split)
      "name:config"       -> (name, config, "train")
      "name"              -> (name, None, "train")

    Fix: a spec with more than three colon-separated fields used to fall
    through to the single-field branch, silently discarding its config and
    split; the first three fields are now used and extras are ignored.
    """
    parts = spec.strip().split(":")
    if len(parts) >= 3:
        return parts[0], parts[1], parts[2]
    if len(parts) == 2:
        return parts[0], parts[1], "train"
    return parts[0], None, "train"
53
+
54
+
55
def load_and_ingest_dataset(qdrant_client, collection_name: str, embedding_func,
                            dataset_name: str, config: str, split: str):
    """Load a dataset from HuggingFace and ingest it into Qdrant.

    Args:
        qdrant_client: connected qdrant_client.QdrantClient
        collection_name: target collection (must already exist)
        embedding_func: callable mapping text -> embedding vector
        dataset_name, config, split: HuggingFace dataset coordinates

    Returns the number of documents ingested; 0 on failure (errors are
    printed rather than raised so one bad dataset does not abort the run).
    """
    import uuid  # local import, mirroring the function-local qdrant import below

    print(f"Loading dataset: {dataset_name} (config={config}, split={split})")

    try:
        # Load dataset
        if config:
            dataset = load_dataset(dataset_name, config, split=split)
        else:
            dataset = load_dataset(dataset_name, split=split)

        print(f" Loaded {len(dataset)} documents")

        # Prepare documents for ingestion
        texts_to_ingest = []
        metadatas_to_ingest = []

        for item in dataset:
            # Extract text - handle different dataset formats
            text = None
            if "text" in item:
                text = item["text"]
            elif "content" in item:
                text = item["content"]
            elif "passage" in item:
                text = item["passage"]
            elif "document" in item:
                text = item["document"]

            if text and isinstance(text, str) and text.strip():
                texts_to_ingest.append(text)

                # Extract metadata (kept in lockstep with texts_to_ingest)
                metadata = {
                    "title": item.get("title", "") or "",
                    "author": item.get("author", "") or "",
                    "genre": item.get("genre", "") or "",
                    "source": f"{dataset_name}:{config}:{split}",
                }

                # Add language info if available
                if "language" in item:
                    metadata["language"] = item["language"]

                metadatas_to_ingest.append(metadata)

        if not texts_to_ingest:
            print(" No valid texts found in dataset")
            return 0

        print(f" Found {len(texts_to_ingest)} valid texts to ingest")

        # Ingest in batches
        batch_size = 100
        ingested_count = 0

        from qdrant_client.http import models

        for i in range(0, len(texts_to_ingest), batch_size):
            batch_texts = texts_to_ingest[i:i + batch_size]
            batch_metadatas = metadatas_to_ingest[i:i + batch_size]

            # Generate embeddings
            embeddings = [embedding_func(text) for text in batch_texts]

            # Create points. BUG FIX: ids were previously the per-call counter
            # (0, 1, 2, ...), so each dataset ingested into the shared
            # collection overwrote the previous one's points. Deterministic
            # UUIDs derived from (dataset, config, split, position) are unique
            # across datasets and stable across re-runs.
            points = []
            for j, (text, metadata, embedding) in enumerate(zip(batch_texts, batch_metadatas, embeddings)):
                point_id = str(uuid.uuid5(
                    uuid.NAMESPACE_URL,
                    f"{dataset_name}:{config}:{split}:{ingested_count + j}"
                ))
                points.append(models.PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={
                        "full_text": text,
                        **metadata
                    }
                ))

            # Upload to Qdrant
            qdrant_client.upsert(
                collection_name=collection_name,
                points=points
            )

            ingested_count += len(batch_texts)
            print(f" Ingested {ingested_count}/{len(texts_to_ingest)} documents")

        print(f" ✓ Successfully ingested {ingested_count} documents")
        return ingested_count

    except Exception as e:
        print(f" Error loading dataset: {e}")
        return 0
152
+
153
+
154
def main():
    """Main function to load and ingest all configured datasets.

    Reads the comma-separated HF_DATASETS env var (each entry shaped like
    "name:config:split"), ingests every spec not already recorded in the
    hash registry, and records each successful ingestion so re-runs skip it.
    """
    print("=" * 60)
    print("HuggingFace Dataset Loader for Simple RAG")
    print("=" * 60)

    # Get datasets from environment
    hf_datasets = os.getenv("HF_DATASETS", "")

    if not hf_datasets:
        print("No HF_DATASETS environment variable set.")
        print("Set HF_DATASETS to load datasets (e.g., miracl/miracl-corpus:hi:train)")
        return

    # Initialize Qdrant (env-driven: QDRANT_URL / QDRANT_API_KEY or local host/port)
    print("\nInitializing Qdrant...")
    qdrant_setup = QdrantSetup()
    qdrant_client = qdrant_setup.get_client()
    collection_name = qdrant_setup.get_collection_name()

    # Create collection if not exists
    qdrant_setup.create_collection()

    # Initialize embedding generator
    print("Initializing embedding generator...")
    embedding_func = EmbeddingGenerator().get_embedding

    # Load existing hashes (registry of already-ingested dataset specs)
    dataset_hashes = get_dataset_hashes()

    # Parse and process datasets
    dataset_specs = [spec.strip() for spec in hf_datasets.split(",")]

    total_ingested = 0
    for spec in dataset_specs:
        if not spec:
            continue

        dataset_name, config, split = parse_dataset_spec(spec)
        dataset_key = f"{dataset_name}:{config}:{split}"

        # Check if already ingested
        if dataset_key in dataset_hashes:
            print(f"\nSkipping {dataset_key} (already ingested)")
            continue

        print(f"\nProcessing: {dataset_key}")

        # Load and ingest
        count = load_and_ingest_dataset(
            qdrant_client, collection_name, embedding_func,
            dataset_name, config, split
        )

        if count > 0:
            # Save the hash after each dataset (not only at the end), so a
            # later failure does not force re-ingesting completed datasets.
            dataset_hashes[dataset_key] = compute_dataset_hash(
                dataset_name, config, split,
                {"count": count}
            )
            save_dataset_hashes(dataset_hashes)
            total_ingested += count

    print("\n" + "=" * 60)
    print(f"Total documents ingested: {total_ingested}")
    print("=" * 60)
220
+
221
+
222
+ if __name__ == "__main__":
223
+ main()
backend/src/qdrant_setup.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import qdrant_client
2
+ from qdrant_client.http import models
3
+ from qdrant_client.http.models import Distance, VectorParams
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+
10
class QdrantSetup:
    """Connects to Qdrant (cloud or local) and manages the text collection."""

    def __init__(self, host=None, port=None, api_key=None, https=True):
        """
        Initialize Qdrant client - supports both local and cloud instances.

        Cloud mode is used when QDRANT_URL is set; otherwise a local
        instance at QDRANT_HOST:QDRANT_PORT (default localhost:6333).

        Fix: the ``api_key`` argument was previously accepted but ignored in
        favor of the QDRANT_API_KEY env var; an explicitly passed key now
        takes precedence, with the env var as fallback.
        """
        cloud_url = os.getenv("QDRANT_URL")
        cloud_api_key = api_key or os.getenv("QDRANT_API_KEY")

        if cloud_url:
            self.client = qdrant_client.QdrantClient(
                url=cloud_url,
                api_key=cloud_api_key,
                https=https
            )
        else:
            host = host or os.getenv("QDRANT_HOST", "localhost")
            port = port or int(os.getenv("QDRANT_PORT", 6333))
            self.client = qdrant_client.QdrantClient(
                host=host,
                port=port
            )

        # Single shared collection for all ingested documents.
        self.collection_name = "hindi_poems_stories"

    def create_collection(self, vector_size=384):
        """Create the collection if missing.

        The default size of 384 should match the embedding model's output
        dimension — confirm if the embedder is ever changed.
        """
        collections = self.client.get_collections()
        collection_names = [col.name for col in collections.collections]

        if self.collection_name in collection_names:
            print(f"Collection '{self.collection_name}' already exists.")
            return

        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
        print(f"Collection '{self.collection_name}' created successfully.")

    def get_client(self):
        """Return the underlying QdrantClient."""
        return self.client

    def get_collection_name(self):
        """Return the name of the managed collection."""
        return self.collection_name
54
+
55
+
56
+ if __name__ == "__main__":
57
+ qdrant_setup = QdrantSetup()
58
+ qdrant_setup.create_collection()
59
+ print("Qdrant setup completed!")
backend/src/rag_system.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simplified RAG system - No language detection
3
+ Passes query directly to Qdrant and LLM
4
+ """
5
+ import os
6
+ import logging
7
+ from typing import List, Dict, Optional
8
+
9
+ from qdrant_setup import QdrantSetup
10
+ from embedding_generator import EmbeddingGenerator
11
+ from llm_manager import get_llm
12
+ from language_constants import (
13
+ get_supported_languages,
14
+ is_language_supported,
15
+ get_language_info,
16
+ SUPPORTED_LANGUAGES,
17
+ )
18
+ from langchain_core.output_parsers import StrOutputParser
19
+ from qdrant_client.http import models
20
+
21
+ logging.basicConfig(level=logging.INFO)
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
# Simplified prompt - checks if language is supported, then processes with context.
# NOTE: the language check is delegated entirely to the LLM; no programmatic
# language detection happens before this prompt is sent.
# Placeholders filled by str.format: {supported_languages_list}, {context_str}, {query}.
QA_PROMPT = """You are a helpful assistant for literature and poetry.

IMPORTANT: Check if the query language is supported. The supported languages are:
{supported_languages_list}

If the query is in a supported language:
- Respond in the SAME language as the query
- Use the provided context to answer accurately
- If the answer is not in the context, say "I don't know based on the available documents" in the query language
- Do not hallucinate. Provide comprehensive answers based on context.

If the query is NOT in a supported language:
- Respond in English saying you only support the listed languages

Context:
{context_str}

Query: {query}

Answer:"""

# Static refusal message for unsupported languages.
# NOTE(review): not referenced by the visible SimpleRAGSystem code — the
# unsupported-language case is handled inside QA_PROMPT itself; confirm
# whether this constant is used elsewhere before removing it.
UNSUPPORTED_LANG_PROMPT = """I apologize, but I only support the following languages:

{supported_languages_list}

Please try asking your question in one of these supported languages."""
52
+
53
+
54
class SimpleRAGSystem:
    """Retrieval-augmented QA over a Qdrant collection of Indic-language texts.

    Pipeline: embed the query, retrieve the top-k similar documents from
    Qdrant, then ask the LLM to answer in the query's own language using the
    retrieved context. Falls back to a plain synthesis of the retrieved
    documents when no LLM is available or generation fails.
    """

    def __init__(self, llm_provider=None, model_kwargs=None):
        """Initialize Qdrant access, the embedder, the LLM, and language metadata.

        Args:
            llm_provider: Optional provider name forwarded to get_llm().
            model_kwargs: Optional generation kwargs forwarded to get_llm().
        """
        # Qdrant connection and target collection are env-driven (cloud or local).
        qdrant_setup = QdrantSetup()
        self.qdrant_client = qdrant_setup.get_client()
        self.collection_name = qdrant_setup.get_collection_name()

        # Embedding model used to vectorize incoming queries.
        self.embedding_generator = EmbeddingGenerator()

        # Only forward overrides when the caller supplied them so that
        # get_llm() keeps its own defaults otherwise.
        if llm_provider or model_kwargs:
            self.llm = get_llm(provider=llm_provider, model_kwargs=model_kwargs)
        else:
            self.llm = get_llm()

        if self.llm is None:
            logger.error("LLM initialization failed. RAG system will have limited functionality.")

        self.supported_languages = get_supported_languages()
        # Set comprehension instead of set(generator) — same result, idiomatic.
        self.supported_language_codes = {lang["code"] for lang in self.supported_languages}

    def _get_supported_languages_list(self) -> str:
        """Format the supported languages as a bullet list for prompt injection."""
        return "\n".join(
            f" - {lang['name']} ({lang['native_name']})"
            for lang in self.supported_languages
        )

    def retrieve_relevant_documents(self, query: str, top_k: int = 5) -> List[Dict]:
        """Embed `query` and return the top_k most similar documents from Qdrant.

        Args:
            query: User query text (any language the embedder supports).
            top_k: Maximum number of documents to return.

        Returns:
            List of dicts with keys: score, title, author, genre, text,
            source_file. Missing payload fields default to "".
        """
        query_embedding = self.embedding_generator.get_embedding(query)

        search_result = self.qdrant_client.query_points(
            collection_name=self.collection_name,
            query=query_embedding,
            limit=top_k,
        )

        retrieved_docs = []
        for result in search_result.points:
            # A point may carry no payload; normalize to {} once so the
            # .get() lookups below are uniform. (The original repeated a
            # redundant `if payload else ""` per field after this check.)
            payload = result.payload or {}
            retrieved_docs.append({
                "score": result.score or 0,
                "title": payload.get("title", ""),
                "author": payload.get("author", ""),
                "genre": payload.get("genre", ""),
                "text": payload.get("full_text", ""),
                "source_file": payload.get("source_file", ""),
            })

        return retrieved_docs

    def generate_answer(self, query: str, context_docs: List[Dict], user_selected_language: Optional[str] = None) -> str:
        """Generate an answer for `query` grounded in `context_docs`.

        Args:
            query: User query. The prompt instructs the LLM to reply in the
                same language as the query.
            context_docs: Documents from retrieve_relevant_documents().
            user_selected_language: Language code chosen in the frontend.
                NOTE(review): currently unused here — the LLM infers the
                language from the query itself; confirm before relying on it.

        Returns:
            The LLM's answer, or a synthesized document summary when the LLM
            is unavailable, fails, or returns a near-empty response.
        """
        if self.llm is None:
            logger.error("LLM not initialized. Returning synthesized answer from documents.")
            return self._synthesize_answer(query, context_docs)

        # Cap context at 5 documents to bound prompt size. Slicing already
        # handles shorter lists, so no length check is needed.
        limited_docs = context_docs[:5]

        # Each document becomes a numbered context entry; long texts are
        # truncated to 400 characters to keep the prompt compact.
        formatted_contexts = []
        for i, doc in enumerate(limited_docs, 1):
            text_snippet = doc["text"][:400] + "..." if len(doc["text"]) > 400 else doc["text"]
            formatted_contexts.append(
                f"[{i}] Title: {doc['title']}\nAuthor: {doc['author']}\nGenre: {doc['genre']}\nContent: {text_snippet}\nScore: {doc['score']:.3f}\n"
            )

        context_str = "\n\n".join(formatted_contexts)

        prompt_text = QA_PROMPT.format(
            supported_languages_list=self._get_supported_languages_list(),
            context_str=context_str,
            query=query,
        )

        try:
            chain = self.llm | StrOutputParser()
            response = chain.invoke(prompt_text)

            # Guard against empty / near-empty generations.
            if response and len(response.strip()) > 10:
                return response.strip()
            logger.warning("LLM returned empty or minimal response. Using document synthesis.")
        except Exception as e:
            logger.error(f"LLM generation failed: {e}")

        return self._synthesize_answer(query, context_docs)

    def _synthesize_answer(self, query: str, context_docs: List[Dict]) -> str:
        """Build a plain-text answer from the documents when the LLM is unavailable."""
        if not context_docs:
            return f"Sorry, no relevant documents found for: '{query}'"

        synthesized_answer = f"Question: {query}\n\n"
        synthesized_answer += "Information from retrieved documents:\n\n"

        # Only the top 3 documents are summarized; previews capped at 500 chars.
        for i, doc in enumerate(context_docs[:3], 1):
            synthesized_answer += f"{i}. {doc['title']} - {doc['author']} (Score: {doc['score']:.3f})\n"
            text_preview = doc["text"][:500] + "..." if len(doc["text"]) > 500 else doc["text"]
            synthesized_answer += f" Summary: {text_preview}\n\n"

        synthesized_answer += "Information synthesized from the above documents."
        return synthesized_answer

    def query(self, question: str, top_k: int = 5, user_selected_language: Optional[str] = None) -> Dict:
        """Retrieve documents for `question` and generate an answer.

        Args:
            question: User question.
            top_k: Number of documents to retrieve.
            user_selected_language: Optional language code from the frontend
                (passed through to generate_answer; see its NOTE).

        Returns:
            Dict with question, answer, user_selected_language,
            relevant_documents, and supported_languages.
        """
        relevant_docs = self.retrieve_relevant_documents(question, top_k)
        answer = self.generate_answer(question, relevant_docs, user_selected_language)

        return {
            "question": question,
            "answer": answer,
            "user_selected_language": user_selected_language,
            "relevant_documents": relevant_docs,
            "supported_languages": self.supported_languages,
        }

    def get_supported_languages(self) -> List[Dict]:
        """Return the list of supported language descriptors."""
        return self.supported_languages
docker-compose.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Two-service stack: FastAPI backend (port 8000) and Streamlit frontend (port 8501).
services:
  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile
    ports:
      - "8000:8000"
    env_file:
      # Qdrant/LLM credentials and generation parameters (see .env.example).
      - .env
    environment:
      - BACKEND_URL=http://localhost:8000

  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
    ports:
      - "8501:8501"
    environment:
      # Inside the compose network the frontend reaches the backend by its
      # service name, not localhost.
      - BACKEND_URL=http://backend:8000
    depends_on:
      - backend
frontend/Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Streamlit frontend image for the Simple RAG system (serves on port 8501).
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies (build-essential for any packages that compile
# native extensions during pip install)
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Expose port
EXPOSE 8501

# Run the application
# --server.address 0.0.0.0 makes Streamlit reachable from outside the container.
CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "8501"]
frontend/app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Streamlit Frontend for Simple RAG System
3
+ Simplified UI - No sidebar, just query input and response
4
+ """
5
+ import streamlit as st
6
+ import httpx
7
+ import os
8
+
9
+ # Configuration
10
+ BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8000")
11
+
12
+ st.set_page_config(
13
+ page_title="Simple RAG",
14
+ page_icon="🤖",
15
+ layout="centered"
16
+ )
17
+
18
+ # Page title
19
+ st.title("🤖 Simple RAG System")
20
+ st.markdown("Ask questions in any supported Indic language. The system will respond in the same language.")
21
+
22
+ # Supported languages
23
+ SUPPORTED_LANGUAGES = [
24
+ {"code": "hi", "name": "Hindi", "native": "हिंदी"},
25
+ {"code": "bn", "name": "Bengali", "native": "বাংলা"},
26
+ {"code": "gu", "name": "Gujarati", "native": "ગુજરાતી"},
27
+ {"code": "kn", "name": "Kannada", "native": "ಕನ್ನಡ"},
28
+ {"code": "ml", "name": "Malayalam", "native": "മലയാളം"},
29
+ {"code": "mr", "name": "Marathi", "native": "मराठी"},
30
+ {"code": "or", "name": "Odia", "native": "ଓଡ଼ିଆ"},
31
+ {"code": "pa", "name": "Punjabi", "native": "ਪੰਜਾਬੀ"},
32
+ {"code": "ta", "name": "Tamil", "native": "தமிழ்"},
33
+ {"code": "te", "name": "Telugu", "native": "తెలుగు"},
34
+ ]
35
+
36
+ # Language selection
37
+ st.subheader("Select Language")
38
+ language_options = [f"{lang['native']} ({lang['name']})" for lang in SUPPORTED_LANGUAGES]
39
+ selected_language = st.selectbox(
40
+ "Choose a language",
41
+ options=language_options,
42
+ index=0,
43
+ label_visibility="collapsed"
44
+ )
45
+
46
+ # Get language code from selection
47
+ selected_lang_code = SUPPORTED_LANGUAGES[language_options.index(selected_language)]["code"]
48
+
49
+ # Query input
50
+ st.subheader("Ask Your Question")
51
+ query = st.text_area(
52
+ "Enter your question",
53
+ height=100,
54
+ placeholder=f"Type your question in {selected_language}...",
55
+ label_visibility="collapsed"
56
+ )
57
+
58
+ # Submit button
59
+ col1, col2 = st.columns([1, 4])
60
+ with col1:
61
+ submit_button = st.button("🔍 Search", type="primary", use_container_width=True)
62
+
63
+ # Process query
64
+ if submit_button and query.strip():
65
+ with st.spinner("Searching and generating answer..."):
66
+ try:
67
+ # Make API request
68
+ response = httpx.post(
69
+ f"{BACKEND_URL}/query",
70
+ json={
71
+ "query": query,
72
+ "top_k": 5,
73
+ "language": selected_lang_code
74
+ },
75
+ timeout=120.0
76
+ )
77
+
78
+ if response.status_code == 200:
79
+ result = response.json()
80
+
81
+ # Display answer
82
+ st.subheader("Answer")
83
+ st.write(result["answer"])
84
+
85
+ else:
86
+ st.error(f"Error: {response.status_code} - {response.text}")
87
+
88
+ except httpx.ConnectError:
89
+ st.error(f"Could not connect to backend at {BACKEND_URL}. Make sure the backend is running.")
90
+ except Exception as e:
91
+ st.error(f"Error: {str(e)}")
92
+
93
+ # Footer
94
+ st.divider()
95
+ st.markdown(
96
+ """
97
+ <style>
98
+ .footer {
99
+ text-align: center;
100
+ padding: 20px;
101
+ color: #666;
102
+ }
103
+ </style>
104
+ <div class="footer">
105
+ Simple RAG System - Powered by Sarvam-1 and Qdrant
106
+ </div>
107
+ """,
108
+ unsafe_allow_html=True
109
+ )
frontend/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit>=1.28.1
2
+ httpx>=0.25.0