hardik-0212 commited on
Commit
77d2114
·
verified ·
1 Parent(s): 0d0f98d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +361 -0
app.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from pathlib import Path
5
+ import faiss
6
+ import pickle
7
+ import json
8
+ from typing import List, Dict, Tuple, Any, Optional
9
+ import logging
10
+ from dataclasses import dataclass
11
+ from datetime import datetime
12
+ from fastapi import FastAPI, UploadFile, File, HTTPException
13
+ from fastapi.staticfiles import StaticFiles
14
+ from fastapi.responses import HTMLResponse, FileResponse
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from pydantic import BaseModel
17
+ import docx
18
+ from openpyxl import load_workbook
19
+ import PyPDF2
20
+ import fitz
21
+ from sentence_transformers import SentenceTransformer
22
+ import nltk
23
+ from nltk.tokenize import sent_tokenize
24
+
25
app = FastAPI(title="RAG System API", description="Document Search and Query System")

# Wide-open CORS so the bundled single-page frontend can call the API from anywhere.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The 'punkt' model is required by sent_tokenize in TextChunker; fetch once at import.
try:
    nltk.download('punkt', quiet=True)
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate.
    logger.warning("Could not download NLTK data")
42
+
43
@dataclass
class DocumentChunk:
    """One embeddable slice of a source document."""
    content: str       # chunk text
    source: str        # originating file path (or "path - sheet" for Excel)
    page_number: int   # 1-based page / CSV-row number the chunk came from
    chunk_index: int   # position of this chunk within its page
    # Filled in after encoding. The original annotated this `np.ndarray = None`,
    # which mistypes the default; Optional makes the None default explicit.
    embedding: Optional[np.ndarray] = None
50
+
51
class DocumentResponse(BaseModel):
    # Response body of the /upload endpoint.
    message: str      # human-readable status summary
    files: List[str]  # bare filenames that were saved under data/
    success: bool     # True when indexing produced at least one chunk
55
+
56
class QueryRequest(BaseModel):
    # Request body of the /query endpoint.
    question: str   # natural-language question to answer
    top_k: int = 3  # number of chunks to retrieve from the vector store
59
+
60
class QueryResponse(BaseModel):
    # Response body of the /query endpoint.
    answer: str                   # generated (or extractive fallback) answer text
    sources: List[str]            # de-duplicated source identifiers of the hits
    confidence: float             # mean similarity score of the retrieved chunks
    chunks: List[Dict[str, Any]]  # per-hit details: source, content preview, score
65
+
66
class DocumentProcessor:
    """Load supported document formats into a uniform list of page dicts.

    Each returned dict has keys: 'content', 'source', 'page_number', 'metadata'.
    Loaders log the error and return [] on failure instead of raising.
    """

    def __init__(self):
        self.supported_formats = ['.csv', '.docx', '.xlsx', '.xls', '.pdf', '.txt']

    def load_csv(self, file_path: str) -> List[Dict]:
        """Return one entry per CSV row, serialising cells as 'col:val' pairs."""
        try:
            df = pd.read_csv(file_path)
            return [{
                'content': " | ".join([f"{col}:{val}" for col, val in row.items()]),
                'source': file_path,
                'page_number': idx + 1,
                'metadata': {'row_index': idx}
            } for idx, row in df.iterrows()]
        except Exception as e:
            logger.error(f"CSV Error: {e}")
            return []

    def load_txt(self, file_path: str) -> List[Dict]:
        """Return the whole UTF-8 text file as a single entry."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return [{
                'content': content,
                'source': file_path,
                'page_number': 1,
                'metadata': {'content_type': 'text'}
            }]
        except Exception as e:
            logger.error(f"TXT Error: {e}")
            return []

    def load_pdf(self, file_path: str) -> List[Dict]:
        """Return one entry per non-empty PDF page (text layer only)."""
        try:
            # Fixes: close the PyMuPDF handle deterministically (the original
            # leaked it) and extract each page's text once instead of twice.
            with fitz.open(file_path) as doc:
                total_pages = len(doc)
                pages = []
                for i, page in enumerate(doc):
                    text = page.get_text().strip()
                    if text:
                        pages.append({
                            'content': text,
                            'source': file_path,
                            'page_number': i + 1,
                            'metadata': {'total_pages': total_pages}
                        })
                return pages
        except Exception as e:
            logger.error(f"PDF Error: {e}")
            return []

    def load_docx(self, file_path: str) -> List[Dict]:
        """Return all paragraphs of a .docx file joined into a single entry."""
        try:
            doc = docx.Document(file_path)
            content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
            return [{
                'content': content,
                'source': file_path,
                'page_number': 1,
                'metadata': {'content_type': 'docx'}
            }]
        except Exception as e:
            logger.error(f"DOCX Error: {e}")
            return []

    def load_excel(self, file_path: str) -> List[Dict]:
        """Return one entry per worksheet, rows joined with ' | '."""
        try:
            wb = load_workbook(file_path)
            try:
                results = []
                for sheet_name in wb.sheetnames:
                    ws = wb[sheet_name]
                    data = []
                    for row in ws.iter_rows(values_only=True):
                        if any(cell is not None for cell in row):
                            data.append([str(cell) if cell is not None else "" for cell in row])

                    content = "\n".join([" | ".join(row) for row in data])
                    results.append({
                        'content': content,
                        'source': f"{file_path} - {sheet_name}",
                        'page_number': 1,
                        'metadata': {'sheet_name': sheet_name}
                    })
                return results
            finally:
                # The original never released the workbook file handle.
                wb.close()
        except Exception as e:
            logger.error(f"Excel Error: {e}")
            return []

    def process_document(self, file_path: str) -> List[Dict]:
        """Dispatch on file extension; unknown extensions yield []."""
        loaders = {
            '.csv': self.load_csv,
            '.txt': self.load_txt,
            '.pdf': self.load_pdf,
            '.docx': self.load_docx,
            '.xlsx': self.load_excel,
            '.xls': self.load_excel,
        }
        loader = loaders.get(Path(file_path).suffix.lower())
        return loader(file_path) if loader else []
161
+
162
class TextChunker:
    """Split document text into roughly chunk_size-character chunks with word overlap."""

    def __init__(self, chunk_size=512, overlap=50):
        self.chunk_size = chunk_size  # soft maximum characters per chunk
        self.overlap = overlap        # trailing words carried into the next chunk

    def chunk_document(self, doc_data: Dict) -> List[DocumentChunk]:
        """Return DocumentChunks for one page dict produced by DocumentProcessor."""
        content = doc_data['content']
        # Short documents fit in a single chunk; skip sentence tokenisation.
        if len(content) <= self.chunk_size:
            return [DocumentChunk(
                content=content,
                source=doc_data['source'],
                page_number=doc_data['page_number'],
                chunk_index=0
            )]

        sentences = sent_tokenize(content)
        chunks, current_chunk = [], ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) > self.chunk_size:
                # Bug fix: when the very first sentence alone exceeds
                # chunk_size, current_chunk is still "" — the original
                # appended an empty-string chunk here.
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                # Seed the next chunk with the last `overlap` words for context.
                current_chunk = " ".join(current_chunk.split()[-self.overlap:]) + " " + sentence
            else:
                current_chunk += " " + sentence if current_chunk else sentence

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return [DocumentChunk(
            content=chunk,
            source=doc_data['source'],
            page_number=doc_data['page_number'],
            chunk_index=i
        ) for i, chunk in enumerate(chunks)]
195
+
196
class VectorStore:
    """Thin wrapper around a FAISS inner-product index plus aligned chunk metadata."""

    def __init__(self, dimension=384):
        # IndexFlatIP over L2-normalised vectors == cosine-similarity search.
        self.index = faiss.IndexFlatIP(dimension)
        self.chunks = []

    def add_chunks(self, chunks: List[DocumentChunk], embeddings: np.ndarray):
        """Normalise and store embeddings; chunks stay aligned by position."""
        # Cast BEFORE normalising: faiss.normalize_L2 requires a contiguous
        # float32 array, so the original normalise-then-astype order could
        # fail (or discard the normalisation in the astype copy) on float64.
        vectors = np.ascontiguousarray(embeddings, dtype='float32')
        faiss.normalize_L2(vectors)
        self.index.add(vectors)
        self.chunks.extend(chunks)

    def search(self, query_embedding: np.ndarray, k=5) -> List[Tuple[DocumentChunk, float]]:
        """Return up to k (chunk, cosine-score) pairs, best first."""
        query = np.ascontiguousarray(query_embedding.reshape(1, -1), dtype='float32')
        faiss.normalize_L2(query)
        scores, indices = self.index.search(query, k)
        # Bug fix: FAISS pads missing results with index -1; the original
        # `i < len(self.chunks)` check let -1 through, silently returning
        # the LAST chunk (or raising on an empty store).
        return [(self.chunks[i], float(s))
                for s, i in zip(scores[0], indices[0])
                if 0 <= i < len(self.chunks)]
211
+
212
class RAGSystem:
    """Orchestrates document loading, chunking, embedding, and retrieval."""

    def __init__(self):
        self.processor = DocumentProcessor()
        self.chunker = TextChunker()
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.vector_store = VectorStore()
        self.llm = None          # optional generator; extractive fallback when None
        self.indexed_files = []

    def index_documents(self, file_paths: List[str]) -> bool:
        """Chunk and embed every file; return False when nothing was extracted."""
        all_chunks = []
        for path in file_paths:
            logger.info(f"Processing document: {path}")
            for doc in self.processor.process_document(path):
                all_chunks.extend(self.chunker.chunk_document(doc))

        if not all_chunks:
            return False

        logger.info(f"Generated {len(all_chunks)} chunks, creating embeddings...")
        texts = [chunk.content for chunk in all_chunks]
        self.vector_store.add_chunks(all_chunks, self.embedder.encode(texts))
        self.indexed_files.extend(file_paths)
        logger.info(f"Successfully indexed {len(file_paths)} documents")
        return True

    def query(self, question: str, top_k=3) -> Dict[str, Any]:
        """Retrieve the top_k most similar chunks and build the answer payload."""
        logger.info(f"Processing query: {question}")
        query_embed = self.embedder.encode([question])[0]
        results = self.vector_store.search(query_embed, top_k)

        if not results:
            return {
                'answer': "No relevant information found in the uploaded documents.",
                'sources': [],
                'confidence': 0.0,
                'chunks': []
            }

        context = "\n\n".join(
            f"Source: {chunk.source}\nContent: {chunk.content}" for chunk, _ in results
        )

        if self.llm:
            prompt = f"Based on the following context, answer the question: {question}\n\nContext:\n{context}\n\nAnswer:"
            answer = self.llm(prompt, max_tokens=512, temperature=0.7)['choices'][0]['text'].strip()
        else:
            # No LLM configured: fall back to showing the retrieved context.
            answer = f"Based on the retrieved documents, here are the most relevant sections:\n\n{context[:1000]}..."

        chunk_payload = []
        for chunk, score in results:
            preview = chunk.content[:500] + ("..." if len(chunk.content) > 500 else "")
            chunk_payload.append({
                'source': chunk.source,
                'content': preview,
                'score': float(score)
            })

        return {
            'answer': answer,
            'sources': list({chunk.source for chunk, _ in results}),
            'confidence': float(np.mean([score for _, score in results])),
            'chunks': chunk_payload
        }
270
+
271
# Module-level singleton shared by every request handler below.
rag = RAGSystem()
272
+
273
+
274
+ @app.get("/", response_class=HTMLResponse)
275
+ async def serve_frontend():
276
+ try:
277
+ with open("index.html", "r", encoding="utf-8") as f:
278
+ return HTMLResponse(content=f.read())
279
+ except FileNotFoundError:
280
+ return HTMLResponse(
281
+ content="<h1>Frontend not found</h1><p>Please place index.html in the same directory as app.py</p>",
282
+ status_code=404
283
+ )
284
+
285
+ @app.post("/upload", response_model=DocumentResponse)
286
+ async def upload_files(files: List[UploadFile] = File(...)):
287
+ """Upload and process documents"""
288
+ saved_paths = []
289
+
290
+ os.makedirs("data", exist_ok=True)
291
+
292
+ try:
293
+ for file in files:
294
+ file_path = f"data/{file.filename}"
295
+ with open(file_path, "wb") as f:
296
+ content = await file.read()
297
+ f.write(content)
298
+ saved_paths.append(file_path)
299
+ logger.info(f"Saved file: {file_path}")
300
+
301
+ success = rag.index_documents(saved_paths)
302
+
303
+ return DocumentResponse(
304
+ message=f"Successfully processed {len(saved_paths)} files" if success else "Failed to process files",
305
+ files=[Path(p).name for p in saved_paths],
306
+ success=success
307
+ )
308
+
309
+ except Exception as e:
310
+ logger.error(f"Upload error: {e}")
311
+ for path in saved_paths:
312
+ try:
313
+ os.remove(path)
314
+ except:
315
+ pass
316
+ raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
317
+
318
+ @app.post("/query", response_model=QueryResponse)
319
+ async def process_query(query: QueryRequest):
320
+ """Process a query against the indexed documents"""
321
+ try:
322
+ if len(rag.vector_store.chunks) == 0:
323
+ raise HTTPException(
324
+ status_code=400,
325
+ detail="No documents have been uploaded and indexed yet. Please upload documents first."
326
+ )
327
+
328
+ result = rag.query(query.question, query.top_k)
329
+ return QueryResponse(**result)
330
+
331
+ except HTTPException:
332
+ raise
333
+ except Exception as e:
334
+ logger.error(f"Query error: {e}")
335
+ raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
336
+
337
+ @app.get("/health")
338
+ def health_check():
339
+ """Health check endpoint"""
340
+ return {
341
+ "status": "running",
342
+ "indexed_files": len(rag.indexed_files),
343
+ "total_chunks": len(rag.vector_store.chunks)
344
+ }
345
+
346
+ if __name__ == "__main__":
347
+ import uvicorn
348
+
349
+ os.makedirs("data", exist_ok=True)
350
+
351
+ print("๐Ÿš€ Starting RAG System Server...")
352
+ print("๐Ÿ“ Frontend will be available at: http://localhost:8000")
353
+ print("๐Ÿ“š API docs available at: http://localhost:8000/docs")
354
+
355
+ uvicorn.run(
356
+ app,
357
+ host="0.0.0.0",
358
+ port=8000,
359
+ reload=True,
360
+ log_level="info"
361
+ )