JigneshPrajapati18 committed on
Commit
3ee59ee
·
verified ·
1 Parent(s): 77dc82d

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ rag_storage/vector_store.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /app

# Build toolchain needed to compile native wheels (e.g. faiss, PyMuPDF).
# --no-install-recommends keeps the image smaller; the apt list cleanup
# must be in the same layer to actually save space.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached across
# source-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

COPY . .

# Directories the app expects at runtime (templates/static for Flask,
# uploads/documents for ingested files). One RUN = one layer.
RUN mkdir -p /app/templates /app/static /app/uploads /app/documents

EXPOSE 7860

CMD ["python", "app.py"]
RAG.py ADDED
@@ -0,0 +1,1285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # import re
3
+ # import fitz
4
+ # import nltk
5
+ # import numpy as np
6
+ # import pandas as pd
7
+ # from typing import List, Dict, Tuple, Any, Optional
8
+ # from sentence_transformers import SentenceTransformer
9
+ # from nltk.tokenize import sent_tokenize
10
+ # import logging
11
+ # import json
12
+ # from sklearn.metrics.pairwise import cosine_similarity
13
+ # import torch
14
+ # from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging as hf_logging
15
+ # from pathlib import Path
16
+ # import faiss
17
+ # from unstructured.partition.auto import partition
18
+ # import tempfile
19
+ # import pickle
20
+ # import shutil
21
+
22
+ # hf_logging.set_verbosity_error()
23
+
24
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
+ # logger = logging.getLogger(__name__)
26
+
27
+ # EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'
28
+ # GENERATIVE_MODEL_NAME = "microsoft/phi-2"
29
+ # DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
30
+ # PHI_MAX_NEW_TOKENS = 250
31
+ # PHI_TEMPERATURE = 0.3
32
+ # QUERY_SIMILARITY_THRESHOLD = 0.50
33
+ # CHUNK_SIZE = 100
34
+ # CHUNK_OVERLAP = 30
35
+ # STORAGE_DIR = "rag_storage"
36
+
37
+ # try:
38
+ # nltk.download('punkt', quiet=True)
39
+ # logger.info("NLTK punkt found or downloaded successfully")
40
+ # except Exception as e:
41
+ # logger.warning(f"Failed to download or find NLTK punkt: {e}. Using fallback tokenization.")
42
+
43
+ # def simple_sent_tokenize(text):
44
+ # sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
45
+ # return [s for s in sentences if s.strip()]
46
+
47
+ # sent_tokenize = simple_sent_tokenize
48
+
49
+ # class DocumentProcessor:
50
+ # def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME, device: str = DEVICE):
51
+ # try:
52
+ # self.embedding_model = SentenceTransformer(embedding_model_name, device=device)
53
+ # logger.info(f"Initialized embedding model: {embedding_model_name} on device: {device}")
54
+ # self.device = device
55
+ # self.vector_store = None
56
+ # self.chunks = []
57
+ # self.doc_metadata = []
58
+ # self.storage_dir = STORAGE_DIR
59
+ # os.makedirs(self.storage_dir, exist_ok=True)
60
+ # except Exception as e:
61
+ # logger.error(f"Failed to load embedding model {embedding_model_name}: {e}")
62
+ # raise
63
+
64
+ # def save_state(self):
65
+ # """Save the current state to disk"""
66
+ # try:
67
+ # # Save FAISS index if it exists
68
+ # if self.vector_store is not None:
69
+ # faiss.write_index(self.vector_store, os.path.join(self.storage_dir, "vector_store.faiss"))
70
+
71
+ # # Save chunks and metadata
72
+ # state = {
73
+ # "chunks": self.chunks,
74
+ # "doc_metadata": self.doc_metadata
75
+ # }
76
+
77
+ # with open(os.path.join(self.storage_dir, "metadata.pkl"), "wb") as f:
78
+ # pickle.dump(state, f)
79
+
80
+ # logger.info("Successfully saved document processor state")
81
+ # return True
82
+ # except Exception as e:
83
+ # logger.error(f"Failed to save state: {e}")
84
+ # return False
85
+
86
+ # def load_state(self) -> bool:
87
+ # """Load state from disk if available"""
88
+ # try:
89
+ # faiss_path = os.path.join(self.storage_dir, "vector_store.faiss")
90
+ # metadata_path = os.path.join(self.storage_dir, "metadata.pkl")
91
+
92
+ # if os.path.exists(faiss_path) and os.path.exists(metadata_path):
93
+ # # Load FAISS index
94
+ # self.vector_store = faiss.read_index(faiss_path)
95
+
96
+ # # Load metadata and chunks
97
+ # with open(metadata_path, "rb") as f:
98
+ # state = pickle.load(f)
99
+ # self.chunks = state["chunks"]
100
+ # self.doc_metadata = state["doc_metadata"]
101
+
102
+ # logger.info(f"Successfully loaded state with {len(self.chunks)} chunks and {len(self.doc_metadata)} documents")
103
+ # return True
104
+ # else:
105
+ # logger.info("No saved state found - starting fresh")
106
+ # return False
107
+ # except Exception as e:
108
+ # logger.error(f"Failed to load state: {e}")
109
+ # return False
110
+
111
+ # def clear_state(self) -> bool:
112
+ # """Clear all stored data"""
113
+ # try:
114
+ # if os.path.exists(self.storage_dir):
115
+ # shutil.rmtree(self.storage_dir)
116
+ # os.makedirs(self.storage_dir, exist_ok=True)
117
+
118
+ # self.vector_store = None
119
+ # self.chunks = []
120
+ # self.doc_metadata = []
121
+
122
+ # logger.info("Successfully cleared all stored data")
123
+ # return True
124
+ # except Exception as e:
125
+ # logger.error(f"Failed to clear state: {e}")
126
+ # return False
127
+
128
+ # def _process_file(self, file_path: str) -> Tuple[str, str]:
129
+ # """Process different file types and extract text"""
130
+ # try:
131
+ # # Try unstructured first
132
+ # try:
133
+ # elements = partition(filename=file_path)
134
+ # text = "\n\n".join([str(el) for el in elements])
135
+ # title = Path(file_path).stem
136
+ # return text, title
137
+ # except ImportError:
138
+ # # Fallback to PyMuPDF for PDFs
139
+ # if file_path.lower().endswith('.pdf'):
140
+ # doc = fitz.open(file_path)
141
+ # text = ""
142
+ # for page in doc:
143
+ # text += page.get_text() + "\n\n"
144
+ # doc.close()
145
+ # title = Path(file_path).stem
146
+ # return text, title
147
+ # else:
148
+ # raise
149
+ # except Exception as e:
150
+ # logger.error(f"Error processing file {file_path}: {e}")
151
+ # return "", Path(file_path).stem
152
+
153
+ # def chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
154
+ # """Split text into chunks with overlap using sentence boundaries"""
155
+ # if not text:
156
+ # return []
157
+
158
+ # try:
159
+ # sentences = sent_tokenize(text)
160
+ # except Exception as e:
161
+ # logger.error(f"Sentence tokenization failed: {e}. Using simple split.")
162
+ # sentences = re.split(r'[\n\.\?\!]+', text)
163
+ # sentences = [s.strip() for s in sentences if s.strip()]
164
+
165
+ # if not sentences:
166
+ # logger.warning("No sentences found after tokenization.")
167
+ # return [text] if len(text) <= chunk_size else [text[i:i+chunk_size] for i in range(0, len(text), chunk_size-overlap)]
168
+
169
+ # chunks = []
170
+ # current_chunk = []
171
+ # current_length = 0
172
+
173
+ # for sentence in sentences:
174
+ # sentence_len = len(sentence)
175
+ # if current_length + sentence_len > chunk_size:
176
+ # if current_chunk:
177
+ # chunks.append(" ".join(current_chunk))
178
+ # current_chunk = current_chunk[-max(1, len(current_chunk)*overlap//chunk_size):] # Keep overlap
179
+ # current_length = sum(len(s) for s in current_chunk)
180
+
181
+ # if sentence_len <= chunk_size:
182
+ # current_chunk.append(sentence)
183
+ # current_length += sentence_len
184
+ # else:
185
+ # logger.warning(f"Sentence length ({sentence_len}) exceeds chunk size ({chunk_size}). Adding as its own chunk.")
186
+ # chunks.append(sentence)
187
+ # else:
188
+ # current_chunk.append(sentence)
189
+ # current_length += sentence_len
190
+
191
+ # if current_chunk:
192
+ # chunks.append(" ".join(current_chunk))
193
+
194
+ # chunks = [c for c in chunks if c.strip()]
195
+ # logger.info(f"Split text into {len(chunks)} chunks.")
196
+ # return chunks
197
+
198
+ # def generate_embedding(self, text: str) -> Optional[np.ndarray]:
199
+ # """Generate embedding for a single text chunk"""
200
+ # if not text or not isinstance(text, str):
201
+ # logger.warning("generate_embedding called with invalid text.")
202
+ # return None
203
+ # try:
204
+ # self.embedding_model.to(self.device)
205
+ # embedding = self.embedding_model.encode(text, convert_to_numpy=True, show_progress_bar=False)
206
+ # return embedding.astype(np.float32)
207
+ # except Exception as e:
208
+ # logger.error(f"Error generating embedding: {e}")
209
+ # return None
210
+
211
+ # def add_document(self, file_path: str) -> bool:
212
+ # """Process and add a document to the vector store"""
213
+ # logger.info(f"Processing document: {file_path}")
214
+
215
+ # try:
216
+ # # Check if document already exists
217
+ # for doc in self.doc_metadata:
218
+ # if os.path.normpath(doc["path"]) == os.path.normpath(file_path):
219
+ # logger.info(f"Document '{doc['title']}' already exists in the index - skipping")
220
+ # return True
221
+
222
+ # text, title = self._process_file(file_path)
223
+ # if not text:
224
+ # logger.warning(f"No text extracted from {file_path}")
225
+ # return False
226
+
227
+ # chunks = self.chunk_text(text)
228
+ # if not chunks:
229
+ # logger.warning(f"No chunks created for {file_path}")
230
+ # return False
231
+
232
+ # # Generate embeddings for all chunks
233
+ # embeddings = []
234
+ # valid_chunks = []
235
+ # for i, chunk in enumerate(chunks):
236
+ # emb = self.generate_embedding(chunk)
237
+ # if emb is not None:
238
+ # embeddings.append(emb)
239
+ # valid_chunks.append({
240
+ # "text": chunk,
241
+ # "doc_title": title,
242
+ # "doc_path": file_path,
243
+ # "chunk_index": i
244
+ # })
245
+
246
+ # if not embeddings:
247
+ # logger.warning(f"No valid embeddings generated for {file_path}")
248
+ # return False
249
+
250
+ # embeddings = np.array(embeddings)
251
+
252
+ # # Initialize or update FAISS index
253
+ # if self.vector_store is None:
254
+ # self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
255
+ # self.vector_store.add(embeddings)
256
+ # else:
257
+ # self.vector_store.add(embeddings)
258
+
259
+ # # Store metadata
260
+ # start_idx = len(self.chunks)
261
+ # self.chunks.extend(valid_chunks)
262
+
263
+ # self.doc_metadata.append({
264
+ # "title": title,
265
+ # "path": file_path,
266
+ # "chunk_count": len(valid_chunks),
267
+ # "start_idx": start_idx,
268
+ # "end_idx": start_idx + len(valid_chunks) - 1
269
+ # })
270
+
271
+ # # Save state after each document addition
272
+ # self.save_state()
273
+
274
+ # logger.info(f"Successfully added document '{title}' with {len(valid_chunks)} chunks")
275
+ # return True
276
+
277
+ # except Exception as e:
278
+ # logger.error(f"Failed to process document {file_path}: {e}")
279
+ # return False
280
+
281
+ # def search_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
282
+ # """Search for relevant chunks using semantic similarity"""
283
+ # if self.vector_store is None or not self.chunks:
284
+ # logger.warning("No documents have been indexed yet")
285
+ # return []
286
+
287
+ # query_embedding = self.generate_embedding(query)
288
+ # if query_embedding is None:
289
+ # logger.error("Failed to generate embedding for the query")
290
+ # return []
291
+
292
+ # query_embedding = np.array([query_embedding]) # Convert to 2D array
293
+
294
+ # # Search FAISS index
295
+ # distances, indices = self.vector_store.search(query_embedding, top_k)
296
+
297
+ # # Convert to similarity scores (FAISS returns squared L2 distances)
298
+ # similarities = 1 / (1 + distances[0])
299
+
300
+ # results = []
301
+ # for idx, sim in zip(indices[0], similarities):
302
+ # if idx < 0 or idx >= len(self.chunks): # Invalid index
303
+ # continue
304
+
305
+ # chunk_data = self.chunks[idx]
306
+ # results.append({
307
+ # "text": chunk_data["text"],
308
+ # "similarity": float(sim),
309
+ # "doc_title": chunk_data["doc_title"],
310
+ # "doc_path": chunk_data["doc_path"],
311
+ # "chunk_index": chunk_data["chunk_index"]
312
+ # })
313
+
314
+ # # Sort by similarity (highest first)
315
+ # results.sort(key=lambda x: x["similarity"], reverse=True)
316
+
317
+ # # Apply threshold
318
+ # results = [r for r in results if r["similarity"] >= QUERY_SIMILARITY_THRESHOLD]
319
+
320
+ # if not results and top_k > 0:
321
+ # logger.info("No chunks met similarity threshold, returning top result anyway")
322
+ # return results[:1]
323
+
324
+ # return results
325
+
326
+ # class RAGSystem:
327
+ # def __init__(self):
328
+ # logger.info("Initializing RAG System...")
329
+ # try:
330
+ # self.doc_processor = DocumentProcessor(embedding_model_name=EMBEDDING_MODEL_NAME, device=DEVICE)
331
+
332
+ # # Try to load existing state
333
+ # if self.doc_processor.load_state():
334
+ # logger.info("Successfully loaded existing document index")
335
+ # else:
336
+ # logger.info("Starting with a fresh document index")
337
+
338
+ # logger.info(f"Loading Generative LLM: {GENERATIVE_MODEL_NAME} on {DEVICE}...")
339
+ # try:
340
+ # phi_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
341
+ # model_kwargs = {"trust_remote_code": True}
342
+ # if DEVICE == 'cuda':
343
+ # if torch.cuda.is_bf16_supported():
344
+ # logger.info("Using bfloat16 for Phi-2 model.")
345
+ # model_kwargs["torch_dtype"] = torch.bfloat16
346
+ # else:
347
+ # logger.info("Using float16 for Phi-2 model.")
348
+ # model_kwargs["torch_dtype"] = torch.float16
349
+ # else:
350
+ # logger.info("Using float32 for Phi-2 model on CPU.")
351
+ # model_kwargs["torch_dtype"] = torch.float32
352
+
353
+ # phi_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, **model_kwargs)
354
+ # phi_model = phi_model.to(DEVICE)
355
+
356
+ # pipeline_device_index = 0 if DEVICE == "cuda" else -1
357
+ # self.phi_pipe = pipeline(
358
+ # "text-generation",
359
+ # model=phi_model,
360
+ # tokenizer=phi_tokenizer,
361
+ # device=pipeline_device_index
362
+ # )
363
+ # logger.info(f"✅ Generative LLM ({GENERATIVE_MODEL_NAME}) loaded successfully on {DEVICE}.")
364
+ # except Exception as e:
365
+ # logger.error(f"❌ Critical Error loading Phi-2 model: {e}")
366
+ # logger.error("RAG Q&A functionality will be disabled.")
367
+ # self.phi_pipe = None
368
+
369
+ # logger.info("✅ RAG System initialized successfully.")
370
+
371
+ # except Exception as e:
372
+ # logger.critical(f"Failed to initialize RAG System: {e}", exc_info=True)
373
+ # raise RuntimeError("System initialization failed.") from e
374
+
375
+ # def add_document(self, file_path: str) -> bool:
376
+ # """Add a document to the system"""
377
+ # return self.doc_processor.add_document(file_path)
378
+
379
+ # def ask_question(self, question: str, top_k: int = 3) -> Dict[str, Any]:
380
+ # """Answer a question using RAG"""
381
+ # if self.phi_pipe is None:
382
+ # return {
383
+ # "answer": "Error: The AI model is not available. Please check the logs.",
384
+ # "sources": []
385
+ # }
386
+
387
+ # logger.info(f"Processing question: '{question[:100]}...'")
388
+
389
+ # # Step 1: Retrieve relevant chunks
390
+ # relevant_chunks = self.doc_processor.search_chunks(question, top_k)
391
+ # if not relevant_chunks:
392
+ # return {
393
+ # "answer": "No relevant information found in documents to answer this question.",
394
+ # "sources": []
395
+ # }
396
+
397
+ # # Step 2: Prepare context for generation
398
+ # context = "\n\n---\n\n".join([
399
+ # f"Document: {chunk['doc_title']}\nChunk {chunk['chunk_index']} (Similarity: {chunk['similarity']:.2f})\n\n{chunk['text']}"
400
+ # for chunk in relevant_chunks
401
+ # ])
402
+
403
+ # # Step 3: Generate answer with Phi-2
404
+ # prompt = f"""You are a helpful assistant.Answer the question ONLY from the provided context.If the context is insufficient, just say you don't know.
405
+
406
+ # Context:
407
+ # {context}
408
+
409
+ # Question: {question}
410
+
411
+ # Answer: """
412
+
413
+ # try:
414
+ # output = self.phi_pipe(
415
+ # prompt,
416
+ # max_new_tokens=PHI_MAX_NEW_TOKENS,
417
+ # temperature=PHI_TEMPERATURE,
418
+ # do_sample=True,
419
+ # return_full_text=False,
420
+ # pad_token_id=self.phi_pipe.tokenizer.eos_token_id
421
+ # )
422
+
423
+ # generated_text = output[0]["generated_text"].strip()
424
+
425
+ # # Post-processing to clean up the response
426
+ # if "Question:" in generated_text:
427
+ # generated_text = generated_text.split("Question:")[0].strip()
428
+
429
+ # # Extract sources
430
+ # sources = []
431
+ # seen_docs = set()
432
+ # for chunk in relevant_chunks:
433
+ # if chunk['doc_title'] not in seen_docs:
434
+ # sources.append({
435
+ # "document": chunk['doc_title'],
436
+ # "path": chunk['doc_path'],
437
+ # "similarity": chunk['similarity']
438
+ # })
439
+ # seen_docs.add(chunk['doc_title'])
440
+
441
+ # return {
442
+ # "answer": generated_text,
443
+ # "sources": sources,
444
+ # "relevant_chunks": relevant_chunks # For debugging/explanation
445
+ # }
446
+
447
+ # except Exception as e:
448
+ # logger.error(f"Error generating answer: {e}")
449
+ # return {
450
+ # "answer": f"Error generating answer: {str(e)}",
451
+ # "sources": []
452
+ # }
453
+
454
+ # def explain_retrieval(self, question: str):
455
+ # """Explain the retrieval process for educational purposes"""
456
+ # print("\n=== RAG Process Explanation ===")
457
+ # print(f"Question: {question}")
458
+
459
+ # # Step 1: Show query embedding
460
+ # print("\n1. Query Embedding:")
461
+ # query_embedding = self.doc_processor.generate_embedding(question)
462
+ # if query_embedding is not None:
463
+ # print(f"- Generated {len(query_embedding)}-dimensional embedding vector")
464
+ # print(f"- Sample values: {query_embedding[:5]}...")
465
+ # else:
466
+ # print("Failed to generate query embedding")
467
+ # return
468
+
469
+ # # Step 2: Show retrieval
470
+ # print("\n2. Document Chunk Retrieval:")
471
+ # chunks = self.doc_processor.search_chunks(question, top_k=3)
472
+ # if not chunks:
473
+ # print("No relevant chunks found")
474
+ # return
475
+
476
+ # print(f"Found {len(chunks)} relevant chunks:")
477
+ # for i, chunk in enumerate(chunks, 1):
478
+ # print(f"\nChunk {i}:")
479
+ # print(f"- Source: {chunk['doc_title']}")
480
+ # print(f"- Chunk Index: {chunk['chunk_index']}")
481
+ # print(f"- Similarity Score: {chunk['similarity']:.4f}")
482
+ # print(f"- Text Preview: {chunk['text'][:150]}...")
483
+
484
+ # # Step 3: Show context preparation
485
+ # print("\n3. Context Preparation:")
486
+ # print("The top chunks are combined into a context that will be sent to the LLM")
487
+
488
+ # # Step 4: Show generation
489
+ # print("\n4. Generation with Phi-2:")
490
+ # print("The LLM is prompted to answer the question using ONLY the provided context")
491
+ # print("This helps prevent hallucination by grounding the response in the retrieved documents")
492
+
493
+ # # Show actual answer
494
+ # result = self.ask_question(question)
495
+ # print("\nFinal Answer:")
496
+ # print(result['answer'])
497
+
498
+ # print("\nSources:")
499
+ # for source in result['sources']:
500
+ # print(f"- {source['document']} (similarity: {source['similarity']:.2f})")
501
+
502
+ # def list_documents(self) -> List[Dict[str, Any]]:
503
+ # """List all indexed documents"""
504
+ # return [{
505
+ # "title": doc["title"],
506
+ # "path": doc["path"],
507
+ # "chunk_count": doc["chunk_count"]
508
+ # } for doc in self.doc_processor.doc_metadata]
509
+
510
+ # def clear_index(self) -> bool:
511
+ # """Clear all indexed documents"""
512
+ # return self.doc_processor.clear_state()
513
+
514
+ # def close(self):
515
+ # """Clean up resources"""
516
+ # logger.info("Shutting down RAG System...")
517
+ # # Save state before closing
518
+ # self.doc_processor.save_state()
519
+
520
+ # if hasattr(self, 'phi_pipe') and self.phi_pipe:
521
+ # del self.phi_pipe
522
+ # if hasattr(self.doc_processor, 'embedding_model'):
523
+ # del self.doc_processor.embedding_model
524
+ # if DEVICE == 'cuda':
525
+ # torch.cuda.empty_cache()
526
+ # logger.info("Cleared CUDA cache.")
527
+ # logger.info("RAG System shut down.")
528
+
529
+ # def main():
530
+ # rag_system = RAGSystem()
531
+
532
+ # while True:
533
+ # print("\n1. Add Document")
534
+ # print("2. Ask Question")
535
+ # print("3. Explain Retrieval Process")
536
+ # print("4. List Indexed Documents")
537
+ # print("5. Clear All Documents")
538
+ # print("6. Exit")
539
+
540
+ # choice = input("Enter your choice: ")
541
+
542
+ # if choice == "1":
543
+ # file_path = input("Enter document path (CSV, DOCX, PDF, etc.): ").strip('"')
544
+ # if not os.path.exists(file_path):
545
+ # print("File not found!")
546
+ # continue
547
+
548
+ # if rag_system.add_document(file_path):
549
+ # print("Document added successfully!")
550
+ # else:
551
+ # print("Failed to add document")
552
+
553
+ # elif choice == "2":
554
+ # question = input("Enter your question: ")
555
+ # result = rag_system.ask_question(question)
556
+ # print("\nAnswer:", result["answer"])
557
+ # if result["sources"]:
558
+ # print("\nSources:")
559
+ # for src in result["sources"]:
560
+ # print(f"- {src['document']} (similarity: {src['similarity']:.2f})")
561
+ # else:
562
+ # print("(No sources cited)")
563
+
564
+ # elif choice == "3":
565
+ # question = input("Enter a question to explain the retrieval process: ")
566
+ # rag_system.explain_retrieval(question)
567
+
568
+ # elif choice == "4":
569
+ # docs = rag_system.list_documents()
570
+ # if docs:
571
+ # print("\nIndexed Documents:")
572
+ # for i, doc in enumerate(docs, 1):
573
+ # print(f"{i}. {doc['title']} ({doc['chunk_count']} chunks)")
574
+ # print(f" Path: {doc['path']}")
575
+ # else:
576
+ # print("No documents indexed yet")
577
+
578
+ # elif choice == "5":
579
+ # confirm = input("Are you sure you want to clear ALL documents? (y/n): ")
580
+ # if confirm.lower() == 'y':
581
+ # if rag_system.clear_index():
582
+ # print("All documents cleared")
583
+ # else:
584
+ # print("Failed to clear documents")
585
+
586
+ # elif choice == "6":
587
+ # rag_system.close()
588
+ # break
589
+
590
+ # else:
591
+ # print("Invalid choice")
592
+
593
+ # if __name__ == "__main__":
594
+ # main()
595
+
596
+
597
+
598
+
599
+
600
+
601
+
602
+
603
+
604
+ import os
605
+ import re
606
+ import fitz
607
+ import nltk
608
+ import numpy as np
609
+ import pandas as pd
610
+ from typing import List, Dict, Tuple, Any, Optional
611
+ from sentence_transformers import SentenceTransformer
612
+ from nltk.tokenize import sent_tokenize
613
+ import logging
614
+ import json
615
+ from sklearn.metrics.pairwise import cosine_similarity
616
+ import torch
617
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging as hf_logging
618
+ from pathlib import Path
619
+ import faiss
620
+ from unstructured.partition.auto import partition
621
+ import tempfile
622
+ import pickle
623
+ import shutil
624
+
625
# Silence transformers' verbose progress/warning output before model loads.
hf_logging.set_verbosity_error()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Model and retrieval configuration -------------------------------------
EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'
GENERATIVE_MODEL_NAME = "microsoft/phi-2"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
PHI_MAX_NEW_TOKENS = 250            # generation budget per answer
PHI_TEMPERATURE = 0.3               # low temperature: mostly deterministic output
QUERY_SIMILARITY_THRESHOLD = 0.50   # minimum similarity for a chunk to be cited
CHUNK_SIZE = 100                    # approximate characters per chunk
CHUNK_OVERLAP = 30                  # characters of overlap between chunks
STORAGE_DIR = "rag_storage"         # on-disk home of the FAISS index + metadata

# Best-effort download of NLTK's punkt data; failure is non-fatal because a
# regex-based fallback tokenizer is installed below.
try:
    nltk.download('punkt', quiet=True)
    logger.info("NLTK punkt found or downloaded successfully")
except Exception as e:
    logger.warning(f"Failed to download or find NLTK punkt: {e}. Using fallback tokenization.")
645
+
646
def simple_sent_tokenize(text):
    """Regex-based sentence splitter used as a fallback for NLTK.

    Splits on whitespace that follows '.', '?' or '!', with lookbehinds that
    avoid breaking on common abbreviation shapes; empty pieces are dropped.
    """
    parts = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    return [p for p in parts if p.strip()]

# NOTE(review): this rebinding replaces nltk's sent_tokenize with the regex
# fallback unconditionally, even when punkt downloaded successfully above —
# confirm whether that is intentional.
sent_tokenize = simple_sent_tokenize
651
+
652
+ class DocumentProcessor:
653
+ def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME, device: str = DEVICE):
654
+ try:
655
+ self.embedding_model = SentenceTransformer(embedding_model_name, device=device)
656
+ logger.info(f"Initialized embedding model: {embedding_model_name} on device: {device}")
657
+ self.device = device
658
+ self.vector_store = None
659
+ self.chunks = []
660
+ self.doc_metadata = []
661
+ self.storage_dir = STORAGE_DIR
662
+ os.makedirs(self.storage_dir, exist_ok=True)
663
+ except Exception as e:
664
+ logger.error(f"Failed to load embedding model {embedding_model_name}: {e}")
665
+ raise
666
+
667
+ def save_state(self):
668
+ """Save the current state to disk"""
669
+ try:
670
+ # Save FAISS index if it exists
671
+ if self.vector_store is not None:
672
+ faiss.write_index(self.vector_store, os.path.join(self.storage_dir, "vector_store.faiss"))
673
+
674
+ # Save chunks and metadata
675
+ state = {
676
+ "chunks": self.chunks,
677
+ "doc_metadata": self.doc_metadata
678
+ }
679
+
680
+ with open(os.path.join(self.storage_dir, "metadata.pkl"), "wb") as f:
681
+ pickle.dump(state, f)
682
+
683
+ logger.info("Successfully saved document processor state")
684
+ return True
685
+ except Exception as e:
686
+ logger.error(f"Failed to save state: {e}")
687
+ return False
688
+
689
+ def load_state(self) -> bool:
690
+ """Load state from disk if available"""
691
+ try:
692
+ faiss_path = os.path.join(self.storage_dir, "vector_store.faiss")
693
+ metadata_path = os.path.join(self.storage_dir, "metadata.pkl")
694
+
695
+ if os.path.exists(faiss_path) and os.path.exists(metadata_path):
696
+ # Load FAISS index
697
+ self.vector_store = faiss.read_index(faiss_path)
698
+
699
+ # Load metadata and chunks
700
+ with open(metadata_path, "rb") as f:
701
+ state = pickle.load(f)
702
+ self.chunks = state["chunks"]
703
+ self.doc_metadata = state["doc_metadata"]
704
+
705
+ logger.info(f"Successfully loaded state with {len(self.chunks)} chunks and {len(self.doc_metadata)} documents")
706
+ return True
707
+ else:
708
+ logger.info("No saved state found - starting fresh")
709
+ return False
710
+ except Exception as e:
711
+ logger.error(f"Failed to load state: {e}")
712
+ return False
713
+
714
+ def clear_state(self) -> bool:
715
+ """Clear all stored data"""
716
+ try:
717
+ if os.path.exists(self.storage_dir):
718
+ shutil.rmtree(self.storage_dir)
719
+ os.makedirs(self.storage_dir, exist_ok=True)
720
+
721
+ self.vector_store = None
722
+ self.chunks = []
723
+ self.doc_metadata = []
724
+
725
+ logger.info("Successfully cleared all stored data")
726
+ return True
727
+ except Exception as e:
728
+ logger.error(f"Failed to clear state: {e}")
729
+ return False
730
+
731
+ def _extract_pdf_pages(self, file_path: str) -> List[Dict[str, Any]]:
732
+ """Extract text from PDF with page numbers"""
733
+ pages = []
734
+ try:
735
+ doc = fitz.open(file_path)
736
+ for page_num in range(len(doc)):
737
+ page = doc.load_page(page_num)
738
+ text = page.get_text()
739
+ if text.strip(): # Only include pages with content
740
+ pages.append({
741
+ "page_number": page_num + 1,
742
+ "text": text
743
+ })
744
+ doc.close()
745
+ logger.info(f"Extracted {len(pages)} pages from PDF")
746
+ return pages
747
+ except Exception as e:
748
+ logger.error(f"Error extracting PDF pages: {e}")
749
+ return []
750
+
751
+ def _process_file(self, file_path: str) -> Tuple[str, str, List[Dict[str, Any]]]:
752
+ """Process different file types and extract text with page information"""
753
+ try:
754
+ title = Path(file_path).stem
755
+ pages = []
756
+
757
+ # Handle PDF files specially to extract page numbers
758
+ if file_path.lower().endswith('.pdf'):
759
+ pages = self._extract_pdf_pages(file_path)
760
+ text = "\n\n".join([page["text"] for page in pages])
761
+ return text, title, pages
762
+ else:
763
+ # For non-PDF files, try unstructured first
764
+ try:
765
+ elements = partition(filename=file_path)
766
+ text = "\n\n".join([str(el) for el in elements])
767
+ # For non-PDF files, create a single "page"
768
+ pages = [{"page_number": 1, "text": text}]
769
+ return text, title, pages
770
+ except ImportError:
771
+ # Fallback for text files
772
+ if file_path.lower().endswith(('.txt', '.csv')):
773
+ with open(file_path, 'r', encoding='utf-8') as f:
774
+ text = f.read()
775
+ pages = [{"page_number": 1, "text": text}]
776
+ return text, title, pages
777
+ else:
778
+ raise
779
+ except Exception as e:
780
+ logger.error(f"Error processing file {file_path}: {e}")
781
+ return "", Path(file_path).stem, []
782
+
783
+ def _find_chunk_page(self, chunk_text: str, pages: List[Dict[str, Any]]) -> int:
784
+ """Find which page a chunk belongs to"""
785
+ chunk_words = set(chunk_text.lower().split()[:10]) # Use first 10 words for matching
786
+
787
+ best_page = 1
788
+ best_score = 0
789
+
790
+ for page in pages:
791
+ page_words = set(page["text"].lower().split())
792
+ common_words = chunk_words.intersection(page_words)
793
+ score = len(common_words) / len(chunk_words) if chunk_words else 0
794
+
795
+ if score > best_score:
796
+ best_score = score
797
+ best_page = page["page_number"]
798
+
799
+ return best_page
800
+
801
+ def chunk_text(self, text: str, pages: List[Dict[str, Any]], chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Dict[str, Any]]:
802
+ """Split text into chunks with overlap using sentence boundaries and track page numbers"""
803
+ if not text:
804
+ return []
805
+
806
+ try:
807
+ sentences = sent_tokenize(text)
808
+ except Exception as e:
809
+ logger.error(f"Sentence tokenization failed: {e}. Using simple split.")
810
+ sentences = re.split(r'[\n\.\?\!]+', text)
811
+ sentences = [s.strip() for s in sentences if s.strip()]
812
+
813
+ if not sentences:
814
+ logger.warning("No sentences found after tokenization.")
815
+ return [{"text": text, "page_number": 1}] if len(text) <= chunk_size else [{"text": text[i:i+chunk_size], "page_number": 1} for i in range(0, len(text), chunk_size-overlap)]
816
+
817
+ chunks = []
818
+ current_chunk = []
819
+ current_length = 0
820
+
821
+ for sentence in sentences:
822
+ sentence_len = len(sentence)
823
+ if current_length + sentence_len > chunk_size:
824
+ if current_chunk:
825
+ chunk_text = " ".join(current_chunk)
826
+ page_number = self._find_chunk_page(chunk_text, pages)
827
+ chunks.append({
828
+ "text": chunk_text,
829
+ "page_number": page_number
830
+ })
831
+ current_chunk = current_chunk[-max(1, len(current_chunk)*overlap//chunk_size):] # Keep overlap
832
+ current_length = sum(len(s) for s in current_chunk)
833
+
834
+ if sentence_len <= chunk_size:
835
+ current_chunk.append(sentence)
836
+ current_length += sentence_len
837
+ else:
838
+ logger.warning(f"Sentence length ({sentence_len}) exceeds chunk size ({chunk_size}). Adding as its own chunk.")
839
+ page_number = self._find_chunk_page(sentence, pages)
840
+ chunks.append({
841
+ "text": sentence,
842
+ "page_number": page_number
843
+ })
844
+ else:
845
+ current_chunk.append(sentence)
846
+ current_length += sentence_len
847
+
848
+ if current_chunk:
849
+ chunk_text = " ".join(current_chunk)
850
+ page_number = self._find_chunk_page(chunk_text, pages)
851
+ chunks.append({
852
+ "text": chunk_text,
853
+ "page_number": page_number
854
+ })
855
+
856
+ chunks = [c for c in chunks if c["text"].strip()]
857
+ logger.info(f"Split text into {len(chunks)} chunks with page numbers.")
858
+ return chunks
859
+
860
+ def generate_embedding(self, text: str) -> Optional[np.ndarray]:
861
+ """Generate embedding for a single text chunk"""
862
+ if not text or not isinstance(text, str):
863
+ logger.warning("generate_embedding called with invalid text.")
864
+ return None
865
+ try:
866
+ self.embedding_model.to(self.device)
867
+ embedding = self.embedding_model.encode(text, convert_to_numpy=True, show_progress_bar=False)
868
+ return embedding.astype(np.float32)
869
+ except Exception as e:
870
+ logger.error(f"Error generating embedding: {e}")
871
+ return None
872
+
873
+ def add_document(self, file_path: str) -> bool:
874
+ """Process and add a document to the vector store"""
875
+ logger.info(f"Processing document: {file_path}")
876
+
877
+ try:
878
+ # Check if document already exists
879
+ for doc in self.doc_metadata:
880
+ if os.path.normpath(doc["path"]) == os.path.normpath(file_path):
881
+ logger.info(f"Document '{doc['title']}' already exists in the index - skipping")
882
+ return True
883
+
884
+ text, title, pages = self._process_file(file_path)
885
+ if not text:
886
+ logger.warning(f"No text extracted from {file_path}")
887
+ return False
888
+
889
+ chunks = self.chunk_text(text, pages)
890
+ if not chunks:
891
+ logger.warning(f"No chunks created for {file_path}")
892
+ return False
893
+
894
+ # Generate embeddings for all chunks
895
+ embeddings = []
896
+ valid_chunks = []
897
+ for i, chunk_data in enumerate(chunks):
898
+ emb = self.generate_embedding(chunk_data["text"])
899
+ if emb is not None:
900
+ embeddings.append(emb)
901
+ valid_chunks.append({
902
+ "text": chunk_data["text"],
903
+ "page_number": chunk_data["page_number"],
904
+ "doc_title": title,
905
+ "doc_path": file_path,
906
+ "chunk_index": i
907
+ })
908
+
909
+ if not embeddings:
910
+ logger.warning(f"No valid embeddings generated for {file_path}")
911
+ return False
912
+
913
+ embeddings = np.array(embeddings)
914
+
915
+ # Initialize or update FAISS index
916
+ if self.vector_store is None:
917
+ self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
918
+ self.vector_store.add(embeddings)
919
+ else:
920
+ self.vector_store.add(embeddings)
921
+
922
+ # Store metadata
923
+ start_idx = len(self.chunks)
924
+ self.chunks.extend(valid_chunks)
925
+
926
+ self.doc_metadata.append({
927
+ "title": title,
928
+ "path": file_path,
929
+ "chunk_count": len(valid_chunks),
930
+ "start_idx": start_idx,
931
+ "end_idx": start_idx + len(valid_chunks) - 1,
932
+ "total_pages": max([page["page_number"] for page in pages]) if pages else 1
933
+ })
934
+
935
+ # Save state after each document addition
936
+ self.save_state()
937
+
938
+ logger.info(f"Successfully added document '{title}' with {len(valid_chunks)} chunks")
939
+ return True
940
+
941
+ except Exception as e:
942
+ logger.error(f"Failed to process document {file_path}: {e}")
943
+ return False
944
+
945
+ def search_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
946
+ """Search for relevant chunks using semantic similarity"""
947
+ if self.vector_store is None or not self.chunks:
948
+ logger.warning("No documents have been indexed yet")
949
+ return []
950
+
951
+ query_embedding = self.generate_embedding(query)
952
+ if query_embedding is None:
953
+ logger.error("Failed to generate embedding for the query")
954
+ return []
955
+
956
+ query_embedding = np.array([query_embedding]) # Convert to 2D array
957
+
958
+ # Search FAISS index
959
+ distances, indices = self.vector_store.search(query_embedding, top_k)
960
+
961
+ # Convert to similarity scores (FAISS returns squared L2 distances)
962
+ similarities = 1 / (1 + distances[0])
963
+
964
+ results = []
965
+ for idx, sim in zip(indices[0], similarities):
966
+ if idx < 0 or idx >= len(self.chunks): # Invalid index
967
+ continue
968
+
969
+ chunk_data = self.chunks[idx]
970
+ results.append({
971
+ "text": chunk_data["text"],
972
+ "similarity": float(sim),
973
+ "doc_title": chunk_data["doc_title"],
974
+ "doc_path": chunk_data["doc_path"],
975
+ "chunk_index": chunk_data["chunk_index"],
976
+ "page_number": chunk_data["page_number"]
977
+ })
978
+
979
+ # Sort by similarity (highest first)
980
+ results.sort(key=lambda x: x["similarity"], reverse=True)
981
+
982
+ # Apply threshold
983
+ results = [r for r in results if r["similarity"] >= QUERY_SIMILARITY_THRESHOLD]
984
+
985
+ if not results and top_k > 0:
986
+ logger.info("No chunks met similarity threshold, returning top result anyway")
987
+ return results[:1]
988
+
989
+ return results
990
+
991
class RAGSystem:
    """High-level RAG facade.

    Owns a DocumentProcessor (chunking, embeddings, FAISS retrieval) and a
    Phi-2 text-generation pipeline, wiring them together for document-grounded
    question answering.
    """

    def __init__(self):
        # Builds the document processor, restores any persisted index, then
        # loads the generative model. A model-loading failure is non-fatal:
        # retrieval keeps working and Q&A is disabled (phi_pipe = None).
        logger.info("Initializing RAG System...")
        try:
            self.doc_processor = DocumentProcessor(embedding_model_name=EMBEDDING_MODEL_NAME, device=DEVICE)

            # Try to load existing state
            if self.doc_processor.load_state():
                logger.info("Successfully loaded existing document index")
            else:
                logger.info("Starting with a fresh document index")

            logger.info(f"Loading Generative LLM: {GENERATIVE_MODEL_NAME} on {DEVICE}...")
            try:
                phi_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
                model_kwargs = {"trust_remote_code": True}
                # Choose the narrowest dtype the hardware supports to save GPU memory.
                if DEVICE == 'cuda':
                    if torch.cuda.is_bf16_supported():
                        logger.info("Using bfloat16 for Phi-2 model.")
                        model_kwargs["torch_dtype"] = torch.bfloat16
                    else:
                        logger.info("Using float16 for Phi-2 model.")
                        model_kwargs["torch_dtype"] = torch.float16
                else:
                    logger.info("Using float32 for Phi-2 model on CPU.")
                    model_kwargs["torch_dtype"] = torch.float32

                phi_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, **model_kwargs)
                phi_model = phi_model.to(DEVICE)

                # transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
                pipeline_device_index = 0 if DEVICE == "cuda" else -1
                self.phi_pipe = pipeline(
                    "text-generation",
                    model=phi_model,
                    tokenizer=phi_tokenizer,
                    device=pipeline_device_index
                )
                logger.info(f"✅ Generative LLM ({GENERATIVE_MODEL_NAME}) loaded successfully on {DEVICE}.")
            except Exception as e:
                # Degrade gracefully: retrieval still works, Q&A is disabled.
                logger.error(f"❌ Critical Error loading Phi-2 model: {e}")
                logger.error("RAG Q&A functionality will be disabled.")
                self.phi_pipe = None

            logger.info("✅ RAG System initialized successfully.")

        except Exception as e:
            logger.critical(f"Failed to initialize RAG System: {e}", exc_info=True)
            raise RuntimeError("System initialization failed.") from e

    def add_document(self, file_path: str) -> bool:
        """Add a document to the system (delegates to DocumentProcessor)."""
        return self.doc_processor.add_document(file_path)

    def ask_question(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """Answer a question using RAG.

        Retrieves up to top_k relevant chunks, builds a grounded prompt, and
        generates an answer with Phi-2.

        Returns:
            Dict with keys "answer", "sources", "question_chunks" and (on
            success) "relevant_chunks". Errors are reported in "answer"
            rather than raised.
        """
        # Guard: the generative pipeline may be None if model loading failed.
        if self.phi_pipe is None:
            return {
                "answer": "Error: The AI model is not available. Please check the logs.",
                "sources": [],
                "question_chunks": []
            }

        logger.info(f"Processing question: '{question[:100]}...'")

        # Step 1: Retrieve relevant chunks
        relevant_chunks = self.doc_processor.search_chunks(question, top_k)
        if not relevant_chunks:
            return {
                "answer": "No relevant information found in documents to answer this question.",
                "sources": [],
                "question_chunks": []
            }

        # Step 2: Prepare context for generation — each chunk is labeled with
        # its document, page, and similarity so the model can cite sources.
        context = "\n\n---\n\n".join([
            f"Document: {chunk['doc_title']} (Page {chunk['page_number']})\nChunk {chunk['chunk_index']} (Similarity: {chunk['similarity']:.2f})\n\n{chunk['text']}"
            for chunk in relevant_chunks
        ])

        # Step 3: Generate answer with Phi-2 — the prompt constrains the model
        # to the retrieved context to limit hallucination.
        prompt = f"""You are a helpful assistant. Answer the question ONLY from the provided context. If the context is insufficient, just say you don't know.

Context:
{context}

Question: {question}

Answer: """

        try:
            output = self.phi_pipe(
                prompt,
                max_new_tokens=PHI_MAX_NEW_TOKENS,
                temperature=PHI_TEMPERATURE,
                do_sample=True,
                return_full_text=False,
                pad_token_id=self.phi_pipe.tokenizer.eos_token_id
            )

            generated_text = output[0]["generated_text"].strip()

            # Post-processing: the model sometimes continues with a new
            # "Question:" — truncate anything after that marker.
            if "Question:" in generated_text:
                generated_text = generated_text.split("Question:")[0].strip()

            # Extract sources, deduplicated by (document, page) pair.
            sources = []
            seen_docs = set()
            for chunk in relevant_chunks:
                doc_key = f"{chunk['doc_title']}_page_{chunk['page_number']}"
                if doc_key not in seen_docs:
                    sources.append({
                        "document": chunk['doc_title'],
                        "page_number": chunk['page_number'],
                        "path": chunk['doc_path'],
                        "similarity": chunk['similarity']
                    })
                    seen_docs.add(doc_key)

            # Prepare question chunks for display (text truncated to 200 chars).
            question_chunks = []
            for chunk in relevant_chunks:
                question_chunks.append({
                    "document": chunk['doc_title'],
                    "page_number": chunk['page_number'],
                    "chunk_index": chunk['chunk_index'],
                    "similarity": chunk['similarity'],
                    "text_preview": chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text']
                })

            return {
                "answer": generated_text,
                "sources": sources,
                "question_chunks": question_chunks,
                "relevant_chunks": relevant_chunks  # For debugging/explanation
            }

        except Exception as e:
            logger.error(f"Error generating answer: {e}")
            return {
                "answer": f"Error generating answer: {str(e)}",
                "sources": [],
                "question_chunks": []
            }

    def explain_retrieval(self, question: str):
        """Explain the retrieval process for educational purposes.

        Prints (to stdout) each RAG stage — embedding, retrieval, context
        preparation, generation — then the final answer and its sources.
        """
        print("\n=== RAG Process Explanation ===")
        print(f"Question: {question}")

        # Step 1: Show query embedding
        print("\n1. Query Embedding:")
        query_embedding = self.doc_processor.generate_embedding(question)
        if query_embedding is not None:
            print(f"- Generated {len(query_embedding)}-dimensional embedding vector")
            print(f"- Sample values: {query_embedding[:5]}...")
        else:
            print("Failed to generate query embedding")
            return

        # Step 2: Show retrieval
        print("\n2. Document Chunk Retrieval:")
        chunks = self.doc_processor.search_chunks(question, top_k=3)
        if not chunks:
            print("No relevant chunks found")
            return

        print(f"Found {len(chunks)} relevant chunks:")
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(f"- Source: {chunk['doc_title']} (Page {chunk['page_number']})")
            print(f"- Chunk Index: {chunk['chunk_index']}")
            print(f"- Similarity Score: {chunk['similarity']:.4f}")
            print(f"- Text Preview: {chunk['text'][:150]}...")

        # Step 3: Show context preparation
        print("\n3. Context Preparation:")
        print("The top chunks are combined into a context that will be sent to the LLM")

        # Step 4: Show generation
        print("\n4. Generation with Phi-2:")
        print("The LLM is prompted to answer the question using ONLY the provided context")
        print("This helps prevent hallucination by grounding the response in the retrieved documents")

        # Show actual answer (note: this re-runs retrieval internally).
        result = self.ask_question(question)
        print("\nFinal Answer:")
        print(result['answer'])

        print("\nSources with Page Numbers:")
        for source in result['sources']:
            print(f"- {source['document']} (Page {source['page_number']}, similarity: {source['similarity']:.2f})")

    def list_documents(self) -> List[Dict[str, Any]]:
        """List all indexed documents (title, path, chunk and page counts)."""
        return [{
            "title": doc["title"],
            "path": doc["path"],
            "chunk_count": doc["chunk_count"],
            # Older persisted metadata may lack "total_pages"; default to 1.
            "total_pages": doc.get("total_pages", 1)
        } for doc in self.doc_processor.doc_metadata]

    def clear_index(self) -> bool:
        """Clear all indexed documents. Returns True on success."""
        return self.doc_processor.clear_state()

    def close(self):
        """Persist state and release model/GPU resources."""
        logger.info("Shutting down RAG System...")
        # Save state before closing
        self.doc_processor.save_state()

        if hasattr(self, 'phi_pipe') and self.phi_pipe:
            del self.phi_pipe
        if hasattr(self.doc_processor, 'embedding_model'):
            del self.doc_processor.embedding_model
        if DEVICE == 'cuda':
            torch.cuda.empty_cache()
            logger.info("Cleared CUDA cache.")
        logger.info("RAG System shut down.")
1211
+
1212
def main():
    """Run the interactive command-line menu for the RAG system."""
    rag_system = RAGSystem()

    menu = ("\n1. Add Document\n"
            "2. Ask Question\n"
            "3. Explain Retrieval Process\n"
            "4. List Indexed Documents\n"
            "5. Clear All Documents\n"
            "6. Exit")

    def handle_add():
        # Strip surrounding quotes so drag-and-dropped paths work.
        file_path = input("Enter document path (CSV, DOCX, PDF, etc.): ").strip('"')
        if not os.path.exists(file_path):
            print("File not found!")
            return
        if rag_system.add_document(file_path):
            print("Document added successfully!")
        else:
            print("Failed to add document")

    def handle_ask():
        question = input("Enter your question: ")
        result = rag_system.ask_question(question)
        print("\nAnswer:", result["answer"])
        if result["sources"]:
            print("\nSources:")
            for src in result["sources"]:
                print(f"- {src['document']} (Page {src['page_number']}, similarity: {src['similarity']:.2f})")
        else:
            print("(No sources cited)")

        if result["question_chunks"]:
            print("\nRelevant Chunks:")
            for i, chunk in enumerate(result["question_chunks"], 1):
                print(f"{i}. {chunk['document']} (Page {chunk['page_number']}, Chunk {chunk['chunk_index']})")
                print(f"   Similarity: {chunk['similarity']:.2f}")
                print(f"   Preview: {chunk['text_preview']}")
                print()

    def handle_explain():
        question = input("Enter a question to explain the retrieval process: ")
        rag_system.explain_retrieval(question)

    def handle_list():
        docs = rag_system.list_documents()
        if not docs:
            print("No documents indexed yet")
            return
        print("\nIndexed Documents:")
        for i, doc in enumerate(docs, 1):
            print(f"{i}. {doc['title']} ({doc['chunk_count']} chunks, {doc['total_pages']} pages)")
            print(f"   Path: {doc['path']}")

    def handle_clear():
        confirm = input("Are you sure you want to clear ALL documents? (y/n): ")
        if confirm.lower() == 'y':
            if rag_system.clear_index():
                print("All documents cleared")
            else:
                print("Failed to clear documents")

    # Menu choice -> handler; "6" (exit) is handled inline below.
    actions = {
        "1": handle_add,
        "2": handle_ask,
        "3": handle_explain,
        "4": handle_list,
        "5": handle_clear,
    }

    while True:
        print(menu)
        choice = input("Enter your choice: ")

        if choice == "6":
            rag_system.close()
            break

        handler = actions.get(choice)
        if handler is None:
            print("Invalid choice")
        else:
            handler()


if __name__ == "__main__":
    main()
README.md CHANGED
@@ -1,11 +1,48 @@
1
- ---
2
- title: RAG QA
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Document Q&A with RAG
3
+ emoji: 📄
4
+ colorFrom: indigo
5
+ colorTo: blue # ✅ valid color now
6
+ sdk: gradio
7
+ sdk_version: "4.28.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+
13
+ # Document Q&A with RAG System
14
+
15
+ This is a Retrieval-Augmented Generation (RAG) system deployed on Hugging Face Spaces. It allows you to:
16
+
17
+ 1. Upload documents (PDF, DOCX, TXT, CSV)
18
+ 2. Ask questions about the content
19
+ 3. Get answers grounded in your documents
20
+
21
+ ## Features
22
+
23
+ - Supports multiple document formats
24
+ - Semantic search for relevant content
25
+ - Generative answers using Phi-2 model
26
+ - Persistent document storage
27
+ - Web interface and API endpoints
28
+
29
+ ## How to Use
30
+
31
+ 1. Upload documents using the upload form
32
+ 2. Ask questions in natural language
33
+ 3. View answers with cited sources
34
+
35
+ ## Technical Details
36
+
37
+ - Embedding model: `all-MiniLM-L12-v2`
38
+ - Generative model: `microsoft/phi-2`
39
+ - Vector store: FAISS
40
+ - Web framework: FastAPI + Gradio
41
+
42
+ ## Deployment
43
+
44
+ This app is automatically deployed on Hugging Face Spaces. To run locally:
45
+
46
+ ```bash
47
+ pip install -r requirements.txt
48
+ python app.py
+ ```
__pycache__/RAG.cpython-312.pyc ADDED
Binary file (33.6 kB). View file
 
__pycache__/app.cpython-312.pyc ADDED
Binary file (22.9 kB). View file
 
__pycache__/rag_system.cpython-312.pyc ADDED
Binary file (25.1 kB). View file
 
app.py ADDED
@@ -0,0 +1,1379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Request
2
+ # from fastapi.responses import HTMLResponse, JSONResponse
3
+ # from fastapi.staticfiles import StaticFiles
4
+ # from fastapi.templating import Jinja2Templates
5
+ # from pydantic import BaseModel
6
+ # import os
7
+ # import tempfile
8
+ # import shutil
9
+ # from typing import List, Dict, Any
10
+ # import logging
11
+
12
+ # import sys
13
+ # import os
14
+ # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
15
+
16
+ # try:
17
+ # from RAG import RAGSystem
18
+ # except ImportError:
19
+ # print("Error: Cannot import RAGSystem from RAG.py")
20
+ # print("Make sure RAG.py is in the same directory as app.py")
21
+ # sys.exit(1)
22
+
23
+ # logging.basicConfig(level=logging.DEBUG)
24
+ # logger = logging.getLogger(__name__)
25
+
26
+ # app = FastAPI(title="RAG PDF QA System")
27
+
28
+ # # Setup templates directory
29
+ # templates = Jinja2Templates(directory="templates")
30
+
31
+ # # Try to mount static files directory
32
+ # try:
33
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
34
+ # except Exception as e:
35
+ # logger.warning(f"Static files directory not found: {e}")
36
+
37
+ # # Initialize RAG System
38
+ # try:
39
+ # rag_system = RAGSystem()
40
+ # logger.info("RAG System initialized successfully")
41
+ # except Exception as e:
42
+ # logger.error(f"Failed to initialize RAG System: {e}")
43
+ # rag_system = None
44
+
45
+ # class QuestionRequest(BaseModel):
46
+ # question: str
47
+
48
+ # @app.get("/", response_class=HTMLResponse)
49
+ # async def read_root(request: Request):
50
+ # try:
51
+ # return templates.TemplateResponse("index.html", {"request": request})
52
+ # except Exception as e:
53
+ # logger.error(f"Error serving index.html from templates folder: {e}")
54
+ # return HTMLResponse(content=f"""
55
+ # <html>
56
+ # <body>
57
+ # <h1>RAG PDF QA System</h1>
58
+ # <p>Error: Could not load index.html from templates folder</p>
59
+ # <p>Error details: {str(e)}</p>
60
+ # <p>Make sure you have:</p>
61
+ # <ul>
62
+ # <li>A 'templates' folder in the same directory as app.py</li>
63
+ # <li>index.html file inside the templates folder</li>
64
+ # <li>Installed jinja2: pip install jinja2</li>
65
+ # </ul>
66
+ # </body>
67
+ # </html>
68
+ # """)
69
+
70
+ # @app.post("/upload")
71
+ # async def upload_document(file: UploadFile = File(...)):
72
+ # try:
73
+ # if rag_system is None:
74
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
75
+
76
+ # if not file.filename:
77
+ # raise HTTPException(status_code=400, detail="No file selected")
78
+
79
+ # allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
80
+ # file_extension = os.path.splitext(file.filename)[1].lower()
81
+
82
+ # if file_extension not in allowed_extensions:
83
+ # raise HTTPException(status_code=400, detail=f"File type {file_extension} not supported. Supported types: {', '.join(allowed_extensions)}")
84
+
85
+ # # Create temporary file
86
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
87
+ # shutil.copyfileobj(file.file, temp_file)
88
+ # temp_path = temp_file.name
89
+
90
+ # # Process document
91
+ # success = rag_system.add_document(temp_path)
92
+
93
+ # # Clean up temporary file
94
+ # try:
95
+ # os.unlink(temp_path)
96
+ # except Exception as cleanup_error:
97
+ # logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
98
+
99
+ # if success:
100
+ # return JSONResponse(content={"message": f"Document '{file.filename}' uploaded and processed successfully"})
101
+ # else:
102
+ # raise HTTPException(status_code=500, detail="Failed to process document")
103
+
104
+ # except HTTPException:
105
+ # raise
106
+ # except Exception as e:
107
+ # logger.error(f"Upload error: {e}", exc_info=True)
108
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
109
+
110
+ # @app.post("/ask")
111
+ # async def ask_question(request: QuestionRequest):
112
+ # try:
113
+ # if rag_system is None:
114
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
115
+
116
+ # if not request.question.strip():
117
+ # raise HTTPException(status_code=400, detail="Question cannot be empty")
118
+
119
+ # result = rag_system.ask_question(request.question)
120
+
121
+ # return JSONResponse(content={
122
+ # "answer": result["answer"],
123
+ # "sources": result["sources"]
124
+ # })
125
+
126
+ # except HTTPException:
127
+ # raise
128
+ # except Exception as e:
129
+ # logger.error(f"Question error: {e}", exc_info=True)
130
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
131
+
132
+ # @app.get("/documents")
133
+ # async def get_documents():
134
+ # try:
135
+ # if rag_system is None:
136
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
137
+
138
+ # docs = rag_system.list_documents()
139
+ # return JSONResponse(content={"documents": docs})
140
+ # except HTTPException:
141
+ # raise
142
+ # except Exception as e:
143
+ # logger.error(f"Documents list error: {e}", exc_info=True)
144
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
145
+
146
+ # @app.delete("/clear")
147
+ # async def clear_documents():
148
+ # try:
149
+ # if rag_system is None:
150
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
151
+
152
+ # success = rag_system.clear_index()
153
+ # if success:
154
+ # return JSONResponse(content={"message": "All documents cleared successfully"})
155
+ # else:
156
+ # raise HTTPException(status_code=500, detail="Failed to clear documents")
157
+ # except HTTPException:
158
+ # raise
159
+ # except Exception as e:
160
+ # logger.error(f"Clear error: {e}", exc_info=True)
161
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
162
+
163
+ # @app.get("/health")
164
+ # async def health_check():
165
+ # return {
166
+ # "status": "healthy",
167
+ # "rag_system_initialized": rag_system is not None,
168
+ # "message": "RAG PDF QA System is running"
169
+ # }
170
+
171
+ # if __name__ == "__main__":
172
+ # import uvicorn
173
+ # logger.info("Starting FastAPI server...")
174
+ # uvicorn.run(app, host="0.0.0.0", port=8000, log_level="debug")
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+ # # second code
188
+ # from fastapi import FastAPI, File, UploadFile, HTTPException, Request
189
+ # from fastapi.responses import HTMLResponse, JSONResponse
190
+ # from fastapi.staticfiles import StaticFiles
191
+ # from fastapi.templating import Jinja2Templates
192
+ # from pydantic import BaseModel
193
+ # import os
194
+ # import tempfile
195
+ # import shutil
196
+ # from typing import List, Dict, Any
197
+ # import logging
198
+
199
+ # import sys
200
+ # import os
201
+ # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
202
+
203
+ # try:
204
+ # from RAG import RAGSystem
205
+ # except ImportError:
206
+ # print("Error: Cannot import RAGSystem from RAG.py")
207
+ # print("Make sure RAG.py is in the same directory as app.py")
208
+ # sys.exit(1)
209
+
210
+ # logging.basicConfig(level=logging.INFO)
211
+ # logger = logging.getLogger(__name__)
212
+
213
+ # app = FastAPI(title="RAG PDF QA System")
214
+
215
+ # # Setup templates directory
216
+ # templates = Jinja2Templates(directory="templates")
217
+
218
+ # # Try to mount static files directory
219
+ # try:
220
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
221
+ # except Exception as e:
222
+ # logger.warning(f"Static files directory not found: {e}")
223
+
224
+ # # Initialize RAG System
225
+ # try:
226
+ # rag_system = RAGSystem()
227
+ # logger.info("RAG System initialized successfully")
228
+ # except Exception as e:
229
+ # logger.error(f"Failed to initialize RAG System: {e}")
230
+ # rag_system = None
231
+
232
+ # class QuestionRequest(BaseModel):
233
+ # question: str
234
+ # top_k: int = 3
235
+
236
+ # @app.get("/", response_class=HTMLResponse)
237
+ # async def read_root(request: Request):
238
+ # try:
239
+ # return templates.TemplateResponse("index.html", {"request": request})
240
+ # except Exception as e:
241
+ # logger.error(f"Error serving index.html from templates folder: {e}")
242
+ # return HTMLResponse(content=f"""
243
+ # <html>
244
+ # <body>
245
+ # <h1>RAG PDF QA System</h1>
246
+ # <p>Error: Could not load index.html from templates folder</p>
247
+ # <p>Error details: {str(e)}</p>
248
+ # <p>Make sure you have:</p>
249
+ # <ul>
250
+ # <li>A 'templates' folder in the same directory as app.py</li>
251
+ # <li>index.html file inside the templates folder</li>
252
+ # <li>Installed jinja2: pip install jinja2</li>
253
+ # </ul>
254
+ # </body>
255
+ # </html>
256
+ # """)
257
+
258
+ # @app.post("/upload")
259
+ # async def upload_document(file: UploadFile = File(...)):
260
+ # try:
261
+ # if rag_system is None:
262
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
263
+
264
+ # if not file.filename:
265
+ # raise HTTPException(status_code=400, detail="No file selected")
266
+
267
+ # allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
268
+ # file_extension = os.path.splitext(file.filename)[1].lower()
269
+
270
+ # if file_extension not in allowed_extensions:
271
+ # raise HTTPException(status_code=400, detail=f"File type {file_extension} not supported. Supported types: {', '.join(allowed_extensions)}")
272
+
273
+ # # Create temporary file
274
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
275
+ # shutil.copyfileobj(file.file, temp_file)
276
+ # temp_path = temp_file.name
277
+
278
+ # # Process document
279
+ # success = rag_system.add_document(temp_path)
280
+
281
+ # # Clean up temporary file
282
+ # try:
283
+ # os.unlink(temp_path)
284
+ # except Exception as cleanup_error:
285
+ # logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
286
+
287
+ # if success:
288
+ # return JSONResponse(content={"message": f"Document '{file.filename}' uploaded and processed successfully"})
289
+ # else:
290
+ # raise HTTPException(status_code=500, detail="Failed to process document")
291
+
292
+ # except HTTPException:
293
+ # raise
294
+ # except Exception as e:
295
+ # logger.error(f"Upload error: {e}", exc_info=True)
296
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
297
+
298
+ # @app.post("/ask")
299
+ # async def ask_question(request: QuestionRequest):
300
+ # try:
301
+ # if rag_system is None:
302
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
303
+
304
+ # if not request.question.strip():
305
+ # raise HTTPException(status_code=400, detail="Question cannot be empty")
306
+
307
+ # result = rag_system.ask_question(request.question, top_k=request.top_k)
308
+
309
+ # return JSONResponse(content={
310
+ # "answer": result["answer"],
311
+ # "sources": result["sources"],
312
+ # "question_chunks": result.get("question_chunks", []),
313
+ # "relevant_chunks": result.get("relevant_chunks", [])
314
+ # })
315
+
316
+ # except HTTPException:
317
+ # raise
318
+ # except Exception as e:
319
+ # logger.error(f"Question error: {e}", exc_info=True)
320
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
321
+
322
+ # @app.get("/documents")
323
+ # async def get_documents():
324
+ # try:
325
+ # if rag_system is None:
326
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
327
+
328
+ # docs = rag_system.list_documents()
329
+ # return JSONResponse(content={"documents": docs})
330
+ # except HTTPException:
331
+ # raise
332
+ # except Exception as e:
333
+ # logger.error(f"Documents list error: {e}", exc_info=True)
334
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
335
+
336
+ # @app.delete("/clear")
337
+ # async def clear_documents():
338
+ # try:
339
+ # if rag_system is None:
340
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
341
+
342
+ # success = rag_system.clear_index()
343
+ # if success:
344
+ # return JSONResponse(content={"message": "All documents cleared successfully"})
345
+ # else:
346
+ # raise HTTPException(status_code=500, detail="Failed to clear documents")
347
+ # except HTTPException:
348
+ # raise
349
+ # except Exception as e:
350
+ # logger.error(f"Clear error: {e}", exc_info=True)
351
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
352
+
353
+ # @app.post("/search")
354
+ # async def search_chunks(request: QuestionRequest):
355
+ # try:
356
+ # if rag_system is None:
357
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
358
+
359
+ # if not request.question.strip():
360
+ # raise HTTPException(status_code=400, detail="Search query cannot be empty")
361
+
362
+ # chunks = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)
363
+
364
+ # return JSONResponse(content={
365
+ # "query": request.question,
366
+ # "chunks": chunks,
367
+ # "total_found": len(chunks)
368
+ # })
369
+
370
+ # except HTTPException:
371
+ # raise
372
+ # except Exception as e:
373
+ # logger.error(f"Search error: {e}", exc_info=True)
374
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
375
+
376
+ # @app.get("/health")
377
+ # async def health_check():
378
+ # return {
379
+ # "status": "healthy",
380
+ # "rag_system_initialized": rag_system is not None,
381
+ # "message": "RAG PDF QA System is running",
382
+ # "indexed_documents": len(rag_system.list_documents()) if rag_system else 0
383
+ # }
384
+
385
+ # @app.get("/stats")
386
+ # async def get_stats():
387
+ # try:
388
+ # if rag_system is None:
389
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
390
+
391
+ # docs = rag_system.list_documents()
392
+
393
+ # total_chunks = sum(doc.get("chunk_count", 0) for doc in docs)
394
+ # total_pages = sum(doc.get("total_pages", 1) for doc in docs)
395
+
396
+ # return JSONResponse(content={
397
+ # "total_documents": len(docs),
398
+ # "total_chunks": total_chunks,
399
+ # "total_pages": total_pages,
400
+ # "documents": docs
401
+ # })
402
+
403
+ # except HTTPException:
404
+ # raise
405
+ # except Exception as e:
406
+ # logger.error(f"Stats error: {e}", exc_info=True)
407
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
408
+
409
+ # @app.on_event("shutdown")
410
+ # async def shutdown_event():
411
+ # if rag_system:
412
+ # rag_system.close()
413
+ # logger.info("RAG System closed gracefully")
414
+
415
+ # if __name__ == "__main__":
416
+ # import uvicorn
417
+ # logger.info("Starting FastAPI server...")
418
+ # uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
419
+
420
+
421
+
422
+          # complete code
423
+ # from fastapi import FastAPI, File, UploadFile, HTTPException, Request
424
+ # from fastapi.responses import HTMLResponse, JSONResponse
425
+ # from fastapi.staticfiles import StaticFiles
426
+ # from fastapi.templating import Jinja2Templates
427
+ # from pydantic import BaseModel
428
+ # import os
429
+ # import tempfile
430
+ # import shutil
431
+ # from typing import List, Dict, Any, Optional, Union
432
+ # import logging
433
+ # from datetime import datetime
434
+ # import mimetypes
435
+
436
+ # # Import your RAG system
437
+ # import sys
438
+ # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
439
+ # try:
440
+ # from RAG import RAGSystem
441
+ # except ImportError:
442
+ # print("Error: Cannot import RAGSystem from RAG.py")
443
+ # print("Make sure RAG.py is in the same directory as app.py")
444
+ # sys.exit(1)
445
+
446
+ # # Configure logging
447
+ # logging.basicConfig(
448
+ # level=logging.INFO,
449
+ # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
450
+ # )
451
+ # logger = logging.getLogger(__name__)
452
+
453
+ # # Initialize FastAPI app
454
+ # app = FastAPI(
455
+ # title="Scholar's Archive - Document Intelligence System",
456
+ # description="A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology",
457
+ # version="1.0.0",
458
+ # docs_url="/api/docs",
459
+ # redoc_url="/api/redoc"
460
+ # )
461
+
462
+ # # Setup templates directory
463
+ # templates = Jinja2Templates(directory="templates")
464
+
465
+ # # Try to mount static files directory
466
+ # try:
467
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
468
+ # except Exception as e:
469
+ # logger.warning(f"Static files directory not found: {e}")
470
+
471
+ # # Initialize RAG System
472
+ # try:
473
+ # rag_system = RAGSystem()
474
+ # logger.info("Scholar's Archive RAG System initialized successfully")
475
+ # except Exception as e:
476
+ # logger.error(f"Failed to initialize RAG System: {e}")
477
+ # rag_system = None
478
+
479
+ # # Pydantic models
480
+ # class QuestionRequest(BaseModel):
481
+ # question: str
482
+ # top_k: int = 3
483
+
484
+ # class DocumentInfo(BaseModel):
485
+ # title: str
486
+ # file_type: str
487
+ # upload_date: str
488
+ # chunk_count: int
489
+ # total_pages: Optional[int] = None
490
+
491
+ # # Fixed AnswerResponse model to handle both strings and dictionaries
492
+ # class AnswerResponse(BaseModel):
493
+ # answer: str
494
+ # sources: List[Dict[str, Any]]
495
+ # question_chunks: List[Union[str, Dict[str, Any]]] = []
496
+ # relevant_chunks: List[Union[str, Dict[str, Any]]] = []
497
+
498
+ # class StatsResponse(BaseModel):
499
+ # total_documents: int
500
+ # total_chunks: int
501
+ # total_pages: int
502
+ # documents: List[DocumentInfo]
503
+
504
+ # # Utility functions
505
+ # def get_file_type_icon(filename: str) -> str:
506
+ # """Get appropriate icon for file type"""
507
+ # ext = os.path.splitext(filename)[1].lower()
508
+ # icons = {
509
+ # '.pdf': 'fas fa-file-pdf',
510
+ # '.docx': 'fas fa-file-word',
511
+ # '.txt': 'fas fa-file-alt',
512
+ # '.csv': 'fas fa-file-csv'
513
+ # }
514
+ # return icons.get(ext, 'fas fa-file')
515
+
516
+ # def format_file_size(size_bytes: int) -> str:
517
+ # """Format file size in human readable format"""
518
+ # if size_bytes == 0:
519
+ # return "0 B"
520
+ # size_names = ["B", "KB", "MB", "GB"]
521
+ # i = 0
522
+ # while size_bytes >= 1024 and i < len(size_names) - 1:
523
+ # size_bytes /= 1024.0
524
+ # i += 1
525
+ # return f"{size_bytes:.1f} {size_names[i]}"
526
+
527
+ # def extract_content_from_chunks(chunks):
528
+ # """Extract string content from chunk data structures"""
529
+ # if not chunks:
530
+ # return []
531
+
532
+ # extracted = []
533
+ # for chunk in chunks:
534
+ # if isinstance(chunk, str):
535
+ # extracted.append(chunk)
536
+ # elif isinstance(chunk, dict):
537
+ # # Try different possible keys for text content
538
+ # content = chunk.get('text') or chunk.get('content') or chunk.get('document') or str(chunk)
539
+ # extracted.append(content)
540
+ # else:
541
+ # extracted.append(str(chunk))
542
+
543
+ # return extracted
544
+
545
+ # # Routes
546
+ # @app.get("/", response_class=HTMLResponse)
547
+ # async def read_root(request: Request):
548
+ # """Serve the main classical interface"""
549
+ # try:
550
+ # return templates.TemplateResponse("index.html", {"request": request})
551
+ # except Exception as e:
552
+ # logger.error(f"Error serving index.html from templates folder: {e}")
553
+ # # Return the embedded HTML if templates folder is not available
554
+ # with open("scholar_archive.html", "r", encoding="utf-8") as f:
555
+ # html_content = f.read()
556
+ # return HTMLResponse(content=html_content)
557
+
558
+ # @app.post("/upload")
559
+ # async def upload_document(file: UploadFile = File(...)):
560
+ # """Upload and process a document"""
561
+ # try:
562
+ # if rag_system is None:
563
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
564
+
565
+ # if not file.filename:
566
+ # raise HTTPException(status_code=400, detail="No file selected")
567
+
568
+ # # Validate file type
569
+ # allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
570
+ # file_extension = os.path.splitext(file.filename)[1].lower()
571
+
572
+ # if file_extension not in allowed_extensions:
573
+ # raise HTTPException(
574
+ # status_code=400,
575
+ # detail=f"File type {file_extension} not supported. Supported formats: {', '.join(allowed_extensions)}"
576
+ # )
577
+
578
+ # # Check file size (limit to 50MB)
579
+ # file_size = 0
580
+ # content = await file.read()
581
+ # file_size = len(content)
582
+
583
+ # if file_size > 50 * 1024 * 1024: # 50MB limit
584
+ # raise HTTPException(status_code=400, detail="File size too large. Maximum size is 50MB")
585
+
586
+ # # Create temporary file
587
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
588
+ # temp_file.write(content)
589
+ # temp_path = temp_file.name
590
+
591
+ # logger.info(f"Processing document: {file.filename} ({format_file_size(file_size)})")
592
+
593
+ # # Process document
594
+ # success = rag_system.add_document(temp_path)
595
+
596
+ # # Clean up temporary file
597
+ # try:
598
+ # os.unlink(temp_path)
599
+ # except Exception as cleanup_error:
600
+ # logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
601
+
602
+ # if success:
603
+ # logger.info(f"Successfully processed document: {file.filename}")
604
+ # return JSONResponse(content={
605
+ # "message": f"Document '{file.filename}' has been successfully added to the Scholar's Archive",
606
+ # "filename": file.filename,
607
+ # "size": format_file_size(file_size),
608
+ # "type": file_extension
609
+ # })
610
+ # else:
611
+ # raise HTTPException(status_code=500, detail="Failed to process document")
612
+
613
+ # except HTTPException:
614
+ # raise
615
+ # except Exception as e:
616
+ # logger.error(f"Upload error: {e}", exc_info=True)
617
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
618
+
619
+ # @app.post("/ask", response_model=AnswerResponse)
620
+ # async def ask_question(request: QuestionRequest):
621
+ # """Ask a question about the uploaded documents"""
622
+ # try:
623
+ # if rag_system is None:
624
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
625
+
626
+ # if not request.question.strip():
627
+ # raise HTTPException(status_code=400, detail="Question cannot be empty")
628
+
629
+ # logger.info(f"Processing question: {request.question[:100]}...")
630
+
631
+ # # Get answer from RAG system
632
+ # result = rag_system.ask_question(request.question, top_k=request.top_k)
633
+
634
+ # # Handle the chunks data properly
635
+ # question_chunks = result.get("question_chunks", [])
636
+ # relevant_chunks = result.get("relevant_chunks", [])
637
+
638
+ # # Log the structure to understand what we're getting
639
+ # logger.info(f"Question chunks type: {type(question_chunks)}")
640
+ # logger.info(f"Relevant chunks type: {type(relevant_chunks)}")
641
+ # if question_chunks:
642
+ # logger.info(f"First question chunk type: {type(question_chunks[0])}")
643
+ # if relevant_chunks:
644
+ # logger.info(f"First relevant chunk type: {type(relevant_chunks[0])}")
645
+
646
+ # # Format the response - keep original structure but ensure it's serializable
647
+ # response = AnswerResponse(
648
+ # answer=result["answer"],
649
+ # sources=result["sources"],
650
+ # question_chunks=question_chunks,
651
+ # relevant_chunks=relevant_chunks
652
+ # )
653
+
654
+ # logger.info(f"Successfully answered question with {len(result['sources'])} sources")
655
+
656
+ # return response
657
+
658
+ # except HTTPException:
659
+ # raise
660
+ # except Exception as e:
661
+ # logger.error(f"Question processing error: {e}", exc_info=True)
662
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
663
+
664
+ # @app.get("/documents")
665
+ # async def get_documents():
666
+ # """Get list of all uploaded documents"""
667
+ # try:
668
+ # if rag_system is None:
669
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
670
+
671
+ # docs = rag_system.list_documents()
672
+
673
+ # # Format documents with additional metadata
674
+ # formatted_docs = []
675
+ # for doc in docs:
676
+ # formatted_doc = {
677
+ # "title": doc.get("title", "Unknown Document"),
678
+ # "chunk_count": doc.get("chunk_count", 0),
679
+ # "total_pages": doc.get("total_pages"),
680
+ # "file_type": os.path.splitext(doc.get("title", ""))[1].lower(),
681
+ # "upload_date": doc.get("upload_date", datetime.now().isoformat()),
682
+ # "icon": get_file_type_icon(doc.get("title", ""))
683
+ # }
684
+ # formatted_docs.append(formatted_doc)
685
+
686
+ # return JSONResponse(content={"documents": formatted_docs})
687
+
688
+ # except HTTPException:
689
+ # raise
690
+ # except Exception as e:
691
+ # logger.error(f"Documents list error: {e}", exc_info=True)
692
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
693
+
694
+ # @app.get("/stats", response_model=StatsResponse)
695
+ # async def get_stats():
696
+ # """Get statistics about the document collection"""
697
+ # try:
698
+ # if rag_system is None:
699
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
700
+
701
+ # docs = rag_system.list_documents()
702
+
703
+ # total_chunks = sum(doc.get("chunk_count", 0) for doc in docs)
704
+ # total_pages = sum(doc.get("total_pages", 1) for doc in docs if doc.get("total_pages"))
705
+
706
+ # # Format documents
707
+ # formatted_docs = []
708
+ # for doc in docs:
709
+ # formatted_doc = DocumentInfo(
710
+ # title=doc.get("title", "Unknown Document"),
711
+ # file_type=os.path.splitext(doc.get("title", ""))[1].lower(),
712
+ # upload_date=doc.get("upload_date", datetime.now().isoformat()),
713
+ # chunk_count=doc.get("chunk_count", 0),
714
+ # total_pages=doc.get("total_pages")
715
+ # )
716
+ # formatted_docs.append(formatted_doc)
717
+
718
+ # stats = StatsResponse(
719
+ # total_documents=len(docs),
720
+ # total_chunks=total_chunks,
721
+ # total_pages=total_pages,
722
+ # documents=formatted_docs
723
+ # )
724
+
725
+ # return stats
726
+
727
+ # except HTTPException:
728
+ # raise
729
+ # except Exception as e:
730
+ # logger.error(f"Stats error: {e}", exc_info=True)
731
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
732
+
733
+ # @app.delete("/clear")
734
+ # async def clear_documents():
735
+ # """Clear all documents from the archive"""
736
+ # try:
737
+ # if rag_system is None:
738
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
739
+
740
+ # logger.info("Clearing all documents from Scholar's Archive")
741
+
742
+ # success = rag_system.clear_index()
743
+ # if success:
744
+ # logger.info("Successfully cleared all documents")
745
+ # return JSONResponse(content={
746
+ # "message": "All documents have been successfully removed from the Scholar's Archive"
747
+ # })
748
+ # else:
749
+ # raise HTTPException(status_code=500, detail="Failed to clear documents")
750
+
751
+ # except HTTPException:
752
+ # raise
753
+ # except Exception as e:
754
+ # logger.error(f"Clear error: {e}", exc_info=True)
755
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
756
+
757
+ # @app.post("/search")
758
+ # async def search_chunks(request: QuestionRequest):
759
+ # """Search for relevant document chunks"""
760
+ # try:
761
+ # if rag_system is None:
762
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
763
+
764
+ # if not request.question.strip():
765
+ # raise HTTPException(status_code=400, detail="Search query cannot be empty")
766
+
767
+ # logger.info(f"Searching chunks for: {request.question[:100]}...")
768
+
769
+ # chunks = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)
770
+
771
+ # # Format chunks with additional metadata
772
+ # formatted_chunks = []
773
+ # for chunk in chunks:
774
+ # formatted_chunk = {
775
+ # "content": chunk.get("content", ""),
776
+ # "document": chunk.get("document", "Unknown"),
777
+ # "similarity": chunk.get("similarity", 0.0),
778
+ # "page": chunk.get("page"),
779
+ # "chunk_index": chunk.get("chunk_index")
780
+ # }
781
+ # formatted_chunks.append(formatted_chunk)
782
+
783
+ # return JSONResponse(content={
784
+ # "query": request.question,
785
+ # "chunks": formatted_chunks,
786
+ # "total_found": len(formatted_chunks)
787
+ # })
788
+
789
+ # except HTTPException:
790
+ # raise
791
+ # except Exception as e:
792
+ # logger.error(f"Search error: {e}", exc_info=True)
793
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
794
+
795
+ # @app.get("/health")
796
+ # async def health_check():
797
+ # """Health check endpoint"""
798
+ # try:
799
+ # doc_count = len(rag_system.list_documents()) if rag_system else 0
800
+
801
+ # return {
802
+ # "status": "healthy",
803
+ # "service": "Scholar's Archive - Document Intelligence System",
804
+ # "version": "1.0.0",
805
+ # "rag_system_initialized": rag_system is not None,
806
+ # "indexed_documents": doc_count,
807
+ # "timestamp": datetime.now().isoformat(),
808
+ # "message": "Scholar's Archive is operational and ready to serve"
809
+ # }
810
+ # except Exception as e:
811
+ # logger.error(f"Health check error: {e}")
812
+ # return {
813
+ # "status": "degraded",
814
+ # "service": "Scholar's Archive - Document Intelligence System",
815
+ # "error": str(e),
816
+ # "timestamp": datetime.now().isoformat()
817
+ # }
818
+
819
+ # @app.get("/api/info")
820
+ # async def api_info():
821
+ # """Get API information"""
822
+ # return {
823
+ # "name": "Scholar's Archive API",
824
+ # "description": "Document Intelligence System API",
825
+ # "version": "1.0.0",
826
+ # "endpoints": {
827
+ # "upload": "POST /upload - Upload documents",
828
+ # "ask": "POST /ask - Ask questions",
829
+ # "documents": "GET /documents - List documents",
830
+ # "stats": "GET /stats - Get statistics",
831
+ # "search": "POST /search - Search chunks",
832
+ # "clear": "DELETE /clear - Clear all documents",
833
+ # "health": "GET /health - Health check"
834
+ # },
835
+ # "supported_formats": [".pdf", ".docx", ".txt", ".csv"],
836
+ # "max_file_size": "50MB"
837
+ # }
838
+
839
+ # # Event handlers
840
+ # @app.on_event("startup")
841
+ # async def startup_event():
842
+ # """Application startup event"""
843
+ # logger.info("Starting Scholar's Archive - Document Intelligence System")
844
+ # logger.info("System initialized and ready to serve scholarly inquiries")
845
+
846
+ # @app.on_event("shutdown")
847
+ # async def shutdown_event():
848
+ # """Application shutdown event"""
849
+ # if rag_system:
850
+ # rag_system.close()
851
+ # logger.info("Scholar's Archive system closed gracefully")
852
+ # logger.info("Scholar's Archive shutdown complete")
853
+
854
+ # # Error handlers
855
+ # @app.exception_handler(404)
856
+ # async def not_found_handler(request: Request, exc):
857
+ # """Custom 404 handler"""
858
+ # return JSONResponse(
859
+ # status_code=404,
860
+ # content={
861
+ # "detail": "The requested resource was not found in the Scholar's Archive",
862
+ # "path": str(request.url.path)
863
+ # }
864
+ # )
865
+
866
+ # @app.exception_handler(500)
867
+ # async def internal_error_handler(request: Request, exc):
868
+ # """Custom 500 handler"""
869
+ # logger.error(f"Internal server error: {exc}")
870
+ # return JSONResponse(
871
+ # status_code=500,
872
+ # content={
873
+ # "detail": "An internal error occurred in the Scholar's Archive system",
874
+ # "message": "Please try again later or contact support"
875
+ # }
876
+ # )
877
+
878
+ # # Main execution
879
+ # if __name__ == "__main__":
880
+ # import uvicorn
881
+
882
+ # logger.info("Launching Scholar's Archive - Document Intelligence System")
883
+ # logger.info("Access the interface at: http://localhost:8000")
884
+ # logger.info("API documentation at: http://localhost:8000/api/docs")
885
+
886
+ # uvicorn.run(
887
+ # app,
888
+ # host="0.0.0.0",
889
+ # port=7860,
890
+ # log_level="info",
891
+ # reload=False,
892
+ # access_log=True
893
+ # )
894
+
895
+
896
+
897
+
898
+
899
+ # perfect code
900
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Request
901
+ from fastapi.responses import HTMLResponse, JSONResponse
902
+ from fastapi.staticfiles import StaticFiles
903
+ from fastapi.templating import Jinja2Templates
904
+ from pydantic import BaseModel
905
+ import os
906
+ import tempfile
907
+ import shutil
908
+ from typing import List, Dict, Any, Optional, Union
909
+ import logging
910
+ from datetime import datetime
911
+ import mimetypes
912
+
913
+ # Import your RAG system
914
+ import sys
915
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
916
+ try:
917
+ from RAG import RAGSystem
918
+ except ImportError:
919
+ print("Error: Cannot import RAGSystem from RAG.py")
920
+ print("Make sure RAG.py is in the same directory as app.py")
921
+ sys.exit(1)
922
+
923
# Configure logging (timestamped records for all app and RAG messages)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app; interactive API docs are served under /api/docs and /api/redoc
app = FastAPI(
    title="Scholar's Archive - Document Intelligence System",
    description="A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology",
    version="1.0.0",
    docs_url="/api/docs",
    redoc_url="/api/redoc"
)

# Setup templates directory (read_root renders index.html from here)
templates = Jinja2Templates(directory="templates")

# Try to mount static files directory; the static folder is optional, so a
# missing directory only logs a warning instead of aborting startup.
try:
    app.mount("/static", StaticFiles(directory="static"), name="static")
except Exception as e:
    logger.warning(f"Static files directory not found: {e}")

# Initialize RAG System. On failure, rag_system stays None and every endpoint
# checks for that and responds with HTTP 500.
try:
    rag_system = RAGSystem()
    logger.info("Scholar's Archive RAG System initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize RAG System: {e}")
    rag_system = None
955
+
956
+ # Pydantic models
957
class QuestionRequest(BaseModel):
    # Request payload for the /ask and /search endpoints.
    question: str  # natural-language question or search query
    top_k: int = 3  # number of chunks to retrieve from the index
960
+
961
class DocumentInfo(BaseModel):
    # Metadata describing one indexed document (returned inside /stats).
    title: str  # original filename of the document
    file_type: str  # lowercase file extension, e.g. '.pdf'
    upload_date: str  # ISO-8601 timestamp of when the document was added
    chunk_count: int  # number of indexed text chunks for this document
    total_pages: Optional[int] = None  # page count when known, else None
967
+
968
class AnswerResponse(BaseModel):
    # Response payload for /ask. Chunk lists accept both plain strings and
    # dicts because the RAG layer may return either shape.
    answer: str  # generated answer text
    sources: List[Dict[str, Any]]  # source metadata for each cited chunk
    question_chunks: List[Union[str, Dict[str, Any]]] = []  # chunks matched to the question
    relevant_chunks: List[Union[str, Dict[str, Any]]] = []  # chunks used as answer context
973
+
974
class StatsResponse(BaseModel):
    # Aggregate collection statistics returned by /stats.
    total_documents: int  # number of indexed documents
    total_chunks: int  # sum of chunk_count across all documents
    total_pages: int  # sum of known page counts across documents
    documents: List[DocumentInfo]  # per-document details
979
+
980
+ # Utility functions
981
def get_file_type_icon(filename: str) -> str:
    """Return the Font Awesome icon class matching a file's extension."""
    icon_by_extension = {
        '.pdf': 'fas fa-file-pdf',
        '.docx': 'fas fa-file-word',
        '.txt': 'fas fa-file-alt',
        '.csv': 'fas fa-file-csv',
    }
    # splitext keeps the leading dot; unknown/missing extensions fall back
    # to the generic file icon.
    _, extension = os.path.splitext(filename)
    return icon_by_extension.get(extension.lower(), 'fas fa-file')
991
+
992
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string, e.g. '1.5 KB'."""
    if size_bytes == 0:
        return "0 B"
    units = ("B", "KB", "MB", "GB")
    value = float(size_bytes)
    unit_index = 0
    # Scale down by 1024 until the value fits the unit, capping at GB.
    while value >= 1024 and unit_index < len(units) - 1:
        value /= 1024.0
        unit_index += 1
    return f"{value:.1f} {units[unit_index]}"
1002
+
1003
def extract_content_from_chunks(chunks):
    """Normalize a list of chunk records into plain strings.

    Each chunk may be a plain string, a dict carrying its text under one of
    the keys 'text', 'content' or 'document' (checked in that order), or any
    other object (stringified as a fallback).

    Args:
        chunks: Iterable of chunk records, or a falsy value.

    Returns:
        list[str]: One string per input chunk; [] when `chunks` is empty/None.
    """
    if not chunks:
        return []

    extracted = []
    for chunk in chunks:
        if isinstance(chunk, str):
            extracted.append(chunk)
        elif isinstance(chunk, dict):
            # Check keys for presence (not truthiness): the previous
            # `get('text') or get('content') or ...` chain treated an
            # empty-string value as missing and leaked the dict's repr.
            for key in ('text', 'content', 'document'):
                if chunk.get(key) is not None:
                    extracted.append(str(chunk[key]))
                    break
            else:
                extracted.append(str(chunk))
        else:
            extracted.append(str(chunk))

    return extracted
1020
+
1021
+
1022
+
1023
+ # Routes
1024
+ @app.get("/", response_class=HTMLResponse)
1025
+ async def read_root(request: Request):
1026
+ """Serve the main classical interface"""
1027
+ try:
1028
+ return templates.TemplateResponse("index.html", {"request": request})
1029
+ except Exception as e:
1030
+ logger.error(f"Error serving index.html from templates folder: {e}")
1031
+ # Return the embedded HTML if templates folder is not available
1032
+ with open("scholar_archive.html", "r", encoding="utf-8") as f:
1033
+ html_content = f.read()
1034
+ return HTMLResponse(content=html_content)
1035
+
1036
+ @app.post("/upload")
1037
+ async def upload_document(file: UploadFile = File(...)):
1038
+ """Upload and process a document"""
1039
+ try:
1040
+ if rag_system is None:
1041
+ raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
1042
+
1043
+ if not file.filename:
1044
+ raise HTTPException(status_code=400, detail="No file selected")
1045
+
1046
+ # Validate file type
1047
+ allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
1048
+ file_extension = os.path.splitext(file.filename)[1].lower()
1049
+
1050
+ if file_extension not in allowed_extensions:
1051
+ raise HTTPException(
1052
+ status_code=400,
1053
+ detail=f"File type {file_extension} not supported. Supported formats: {', '.join(allowed_extensions)}"
1054
+ )
1055
+
1056
+ # Check file size (limit to 50MB)
1057
+ file_size = 0
1058
+ content = await file.read()
1059
+ file_size = len(content)
1060
+
1061
+ if file_size > 50 * 1024 * 1024: # 50MB limit
1062
+ raise HTTPException(status_code=400, detail="File size too large. Maximum size is 50MB")
1063
+
1064
+ # Create temporary file
1065
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
1066
+ temp_file.write(content)
1067
+ temp_path = temp_file.name
1068
+
1069
+ logger.info(f"Processing document: {file.filename} ({format_file_size(file_size)})")
1070
+
1071
+ # Process document
1072
+ success = rag_system.add_document(temp_path)
1073
+
1074
+ # Clean up temporary file
1075
+ try:
1076
+ os.unlink(temp_path)
1077
+ except Exception as cleanup_error:
1078
+ logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
1079
+
1080
+ if success:
1081
+ logger.info(f"Successfully processed document: {file.filename}")
1082
+ return JSONResponse(content={
1083
+ "message": f"Document '{file.filename}' has been successfully added to the Scholar's Archive",
1084
+ "filename": file.filename,
1085
+ "size": format_file_size(file_size),
1086
+ "type": file_extension
1087
+ })
1088
+ else:
1089
+ raise HTTPException(status_code=500, detail="Failed to process document")
1090
+
1091
+ except HTTPException:
1092
+ raise
1093
+ except Exception as e:
1094
+ logger.error(f"Upload error: {e}", exc_info=True)
1095
+ raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1096
+
1097
+ @app.post("/ask", response_model=AnswerResponse)
1098
+ async def ask_question(request: QuestionRequest):
1099
+ """Ask a question about the uploaded documents"""
1100
+ try:
1101
+ if rag_system is None:
1102
+ raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
1103
+
1104
+ if not request.question.strip():
1105
+ raise HTTPException(status_code=400, detail="Question cannot be empty")
1106
+
1107
+ logger.info(f"Processing question: {request.question[:100]}...")
1108
+
1109
+ # Get answer from RAG system
1110
+ result = rag_system.ask_question(request.question, top_k=request.top_k)
1111
+
1112
+ # Handle the chunks data properly
1113
+ question_chunks = result.get("question_chunks", [])
1114
+ relevant_chunks = result.get("relevant_chunks", [])
1115
+
1116
+ # Add page numbers to sources
1117
+ sources = result.get("sources", [])
1118
+ for source in sources:
1119
+ if isinstance(source, dict):
1120
+ page_num = source.get('page')
1121
+ if page_num:
1122
+ source['page_reference'] = f"Page {page_num}"
1123
+
1124
+ # Log the structure to understand what we're getting
1125
+ logger.info(f"Question chunks type: {type(question_chunks)}")
1126
+ logger.info(f"Relevant chunks type: {type(relevant_chunks)}")
1127
+ if question_chunks:
1128
+ logger.info(f"First question chunk type: {type(question_chunks[0])}")
1129
+ if relevant_chunks:
1130
+ logger.info(f"First relevant chunk type: {type(relevant_chunks[0])}")
1131
+
1132
+ # Format the response - keep original structure but ensure it's serializable
1133
+ response = AnswerResponse(
1134
+ answer=result["answer"],
1135
+ sources=sources,
1136
+ question_chunks=question_chunks,
1137
+ relevant_chunks=relevant_chunks
1138
+ )
1139
+
1140
+ logger.info(f"Successfully answered question with {len(sources)} sources")
1141
+
1142
+ return response
1143
+
1144
+ except HTTPException:
1145
+ raise
1146
+ except Exception as e:
1147
+ logger.error(f"Question processing error: {e}", exc_info=True)
1148
+ raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1149
+
1150
@app.get("/documents")
async def get_documents():
    """Return metadata for every document currently indexed in the archive.

    Response body: {"documents": [...]} where each entry carries the title,
    chunk count, page count, file extension, upload date, and a UI icon.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        def describe(entry):
            # Build the client-facing metadata record for one indexed document.
            # NOTE(review): `entry` is assumed to be a dict produced by
            # rag_system.list_documents() — confirm its schema there.
            raw_title = entry.get("title", "")
            return {
                "title": entry.get("title", "Unknown Document"),
                "chunk_count": entry.get("chunk_count", 0),
                "total_pages": entry.get("total_pages"),
                "file_type": os.path.splitext(raw_title)[1].lower(),
                "upload_date": entry.get("upload_date", datetime.now().isoformat()),
                "icon": get_file_type_icon(raw_title),
            }

        documents = [describe(entry) for entry in rag_system.list_documents()]
        return JSONResponse(content={"documents": documents})

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Documents list error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1179
+
1180
@app.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Aggregate collection-wide statistics: document, chunk, and page totals,
    plus a per-document summary list."""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        docs = rag_system.list_documents()

        # Collection totals. Page counts may be absent for some formats, so
        # only truthy total_pages values contribute to the page sum.
        chunk_total = sum(d.get("chunk_count", 0) for d in docs)
        page_total = sum(p for p in (d.get("total_pages") for d in docs) if p)

        document_infos = [
            DocumentInfo(
                title=d.get("title", "Unknown Document"),
                file_type=os.path.splitext(d.get("title", ""))[1].lower(),
                upload_date=d.get("upload_date", datetime.now().isoformat()),
                chunk_count=d.get("chunk_count", 0),
                total_pages=d.get("total_pages"),
            )
            for d in docs
        ]

        return StatsResponse(
            total_documents=len(docs),
            total_chunks=chunk_total,
            total_pages=page_total,
            documents=document_infos,
        )

    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as e:
        logger.error(f"Stats error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1218
+
1219
@app.delete("/clear")
async def clear_documents():
    """Remove every document from the archive's index.

    Returns a confirmation message on success; raises 500 when the index
    could not be cleared.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        logger.info("Clearing all documents from Scholar's Archive")

        # clear_index() reports success via its boolean return value.
        if not rag_system.clear_index():
            raise HTTPException(status_code=500, detail="Failed to clear documents")

        logger.info("Successfully cleared all documents")
        return JSONResponse(content={
            "message": "All documents have been successfully removed from the Scholar's Archive"
        })

    except HTTPException:
        # Deliberate HTTP errors propagate as-is.
        raise
    except Exception as e:
        logger.error(f"Clear error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1242
+
1243
@app.post("/search")
async def search_chunks(request: QuestionRequest):
    """Run a similarity search over the index and return the matching chunks.

    Reuses QuestionRequest: `question` is the query text, `top_k` bounds the
    number of results. Raises 400 on an empty query.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        if not request.question.strip():
            raise HTTPException(status_code=400, detail="Search query cannot be empty")

        logger.info(f"Searching chunks for: {request.question[:100]}...")

        hits = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)

        # Project each raw hit onto exactly the fields the client expects.
        results = [
            {
                "content": hit.get("content", ""),
                "document": hit.get("document", "Unknown"),
                "similarity": hit.get("similarity", 0.0),
                "page": hit.get("page"),
                "chunk_index": hit.get("chunk_index"),
            }
            for hit in hits
        ]

        return JSONResponse(content={
            "query": request.question,
            "chunks": results,
            "total_found": len(results)
        })

    except HTTPException:
        # Deliberate HTTP errors propagate as-is.
        raise
    except Exception as e:
        logger.error(f"Search error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1280
+
1281
@app.get("/health")
async def health_check():
    """Health probe: reports 'healthy' normally, 'degraded' if inspection fails.

    Never raises — a failure while gathering status is folded into the
    degraded payload so monitoring always gets a 200 response.
    """
    try:
        # Count indexed documents, tolerating an uninitialized RAG system.
        indexed = len(rag_system.list_documents()) if rag_system else 0

        return {
            "status": "healthy",
            "service": "Scholar's Archive - Document Intelligence System",
            "version": "1.0.0",
            "rag_system_initialized": rag_system is not None,
            "indexed_documents": indexed,
            "timestamp": datetime.now().isoformat(),
            "message": "Scholar's Archive is operational and ready to serve"
        }
    except Exception as e:
        logger.error(f"Health check error: {e}")
        return {
            "status": "degraded",
            "service": "Scholar's Archive - Document Intelligence System",
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }
1304
+
1305
@app.get("/api/info")
async def api_info():
    """Describe the API surface: endpoints, accepted formats, and size limit."""
    # Human-readable summary of every route the service exposes.
    endpoint_summary = {
        "upload": "POST /upload - Upload documents",
        "ask": "POST /ask - Ask questions",
        "documents": "GET /documents - List documents",
        "stats": "GET /stats - Get statistics",
        "search": "POST /search - Search chunks",
        "clear": "DELETE /clear - Clear all documents",
        "health": "GET /health - Health check"
    }

    return {
        "name": "Scholar's Archive API",
        "description": "Document Intelligence System API",
        "version": "1.0.0",
        "endpoints": endpoint_summary,
        "supported_formats": [".pdf", ".docx", ".txt", ".csv"],
        "max_file_size": "50MB"
    }
1324
+
1325
+ # Event handlers
1326
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers — works today, worth migrating eventually.
@app.on_event("startup")
async def startup_event():
    """Application startup event: logs that the service has come up."""
    logger.info("Starting Scholar's Archive - Document Intelligence System")
    logger.info("System initialized and ready to serve scholarly inquiries")
1331
+
1332
@app.on_event("shutdown")
async def shutdown_event():
    """Application shutdown event: releases RAG system resources, if any."""
    # rag_system may be None when initialization failed at import time.
    if rag_system:
        rag_system.close()
        logger.info("Scholar's Archive system closed gracefully")
    logger.info("Scholar's Archive shutdown complete")
1339
+
1340
+ # Error handlers
1341
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    """Custom 404 handler: themed JSON payload including the missed path."""
    payload = {
        "detail": "The requested resource was not found in the Scholar's Archive",
        "path": str(request.url.path)
    }
    return JSONResponse(status_code=404, content=payload)
1351
+
1352
@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
    """Custom 500 handler: logs the failure, hides internals from the client."""
    logger.error(f"Internal server error: {exc}")
    payload = {
        "detail": "An internal error occurred in the Scholar's Archive system",
        "message": "Please try again later or contact support"
    }
    return JSONResponse(status_code=500, content=payload)
1363
+
1364
+ # Main execution
1365
if __name__ == "__main__":
    import uvicorn

    # BUG FIX: the server binds to port 7860 below, but the startup logs
    # previously advertised http://localhost:8000 — the URLs now match.
    logger.info("Launching Scholar's Archive - Document Intelligence System")
    logger.info("Access the interface at: http://localhost:7860")
    logger.info("API documentation at: http://localhost:7860/api/docs")

    uvicorn.run(
        app,
        host="0.0.0.0",  # listen on all interfaces (container-friendly)
        port=7860,       # must agree with the URLs logged above and EXPOSE in the Dockerfile
        log_level="info",
        reload=False,    # reload=True requires an import string, not an app object
        access_log=True
    )
rag_storage/metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea47f16e743fe3b0236ad60c5bf23262ac2a654bf3471d0ef3da50af2813bd3f
3
+ size 53947
rag_storage/vector_store.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51fa7faf223455f98a8e566eca56bcbb9604a37dbb2a41c80bf32bfd60454d8d
3
+ size 365613
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ gradio==4.44.1
4
+ sentence-transformers
5
+ nltk
6
+ pymupdf
7
+ numpy
8
+ faiss-cpu
9
+ torch
10
+ transformers
11
+ unstructured
12
+ python-multipart
13
+ Jinja2
templates/index.html ADDED
@@ -0,0 +1,1338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>RAG PDF Question Answering System</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Times New Roman', serif;
16
+ line-height: 1.6;
17
+ color: #333;
18
+ background-color: #f8f9fa;
19
+ padding: 20px;
20
+ }
21
+
22
+ .container {
23
+ max-width: 1000px;
24
+ margin: 0 auto;
25
+ background: white;
26
+ border-radius: 8px;
27
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
28
+ overflow: hidden;
29
+ }
30
+
31
+ .header {
32
+ background: linear-gradient(135deg, #2c3e50, #34495e);
33
+ color: white;
34
+ padding: 30px;
35
+ text-align: center;
36
+ }
37
+
38
+ .header h1 {
39
+ font-size: 2.2em;
40
+ margin-bottom: 10px;
41
+ font-weight: normal;
42
+ }
43
+
44
+ .header p {
45
+ font-size: 1.1em;
46
+ opacity: 0.9;
47
+ }
48
+
49
+ .main-content {
50
+ padding: 30px;
51
+ }
52
+
53
+ .section {
54
+ margin-bottom: 30px;
55
+ padding: 25px;
56
+ border: 1px solid #e0e0e0;
57
+ border-radius: 6px;
58
+ background: #fafafa;
59
+ }
60
+
61
+ .section h2 {
62
+ color: #2c3e50;
63
+ margin-bottom: 15px;
64
+ font-size: 1.4em;
65
+ border-bottom: 2px solid #3498db;
66
+ padding-bottom: 8px;
67
+ }
68
+
69
+ .upload-area {
70
+ border: 2px dashed #bdc3c7;
71
+ border-radius: 6px;
72
+ padding: 20px;
73
+ text-align: center;
74
+ background: white;
75
+ margin-bottom: 15px;
76
+ transition: border-color 0.3s ease;
77
+ }
78
+
79
+ .upload-area:hover {
80
+ border-color: #3498db;
81
+ }
82
+
83
+ .file-input {
84
+ margin: 10px 0;
85
+ }
86
+
87
+ input[type="file"] {
88
+ padding: 8px;
89
+ border: 1px solid #ddd;
90
+ border-radius: 4px;
91
+ font-family: inherit;
92
+ }
93
+
94
+ .btn {
95
+ background: #3498db;
96
+ color: white;
97
+ border: none;
98
+ padding: 12px 24px;
99
+ border-radius: 4px;
100
+ cursor: pointer;
101
+ font-size: 1em;
102
+ font-family: inherit;
103
+ transition: background 0.3s ease;
104
+ }
105
+
106
+ .btn:hover {
107
+ background: #2980b9;
108
+ }
109
+
110
+ .btn:disabled {
111
+ background: #bdc3c7;
112
+ cursor: not-allowed;
113
+ }
114
+
115
+ .btn-danger {
116
+ background: #e74c3c;
117
+ }
118
+
119
+ .btn-danger:hover {
120
+ background: #c0392b;
121
+ }
122
+
123
+ .question-area {
124
+ background: white;
125
+ padding: 20px;
126
+ border-radius: 6px;
127
+ border: 1px solid #ddd;
128
+ }
129
+
130
+ textarea {
131
+ width: 100%;
132
+ padding: 12px;
133
+ border: 1px solid #ddd;
134
+ border-radius: 4px;
135
+ font-family: inherit;
136
+ font-size: 1em;
137
+ resize: vertical;
138
+ min-height: 80px;
139
+ }
140
+
141
+ .answer-section {
142
+ background: white;
143
+ padding: 20px;
144
+ border-radius: 6px;
145
+ border: 1px solid #ddd;
146
+ margin-top: 15px;
147
+ }
148
+
149
+ .answer {
150
+ background: #f8f9fa;
151
+ padding: 15px;
152
+ border-left: 4px solid #3498db;
153
+ margin-bottom: 15px;
154
+ border-radius: 0 4px 4px 0;
155
+ }
156
+
157
+ .sources {
158
+ background: #fff;
159
+ border: 1px solid #e0e0e0;
160
+ border-radius: 4px;
161
+ padding: 15px;
162
+ }
163
+
164
+ .sources h4 {
165
+ color: #2c3e50;
166
+ margin-bottom: 10px;
167
+ }
168
+
169
+ .source-item {
170
+ padding: 8px 0;
171
+ border-bottom: 1px solid #eee;
172
+ }
173
+
174
+ .source-item:last-child {
175
+ border-bottom: none;
176
+ }
177
+
178
+ .documents-list {
179
+ background: white;
180
+ border-radius: 6px;
181
+ border: 1px solid #ddd;
182
+ max-height: 200px;
183
+ overflow-y: auto;
184
+ }
185
+
186
+ .document-item {
187
+ padding: 12px 15px;
188
+ border-bottom: 1px solid #eee;
189
+ display: flex;
190
+ justify-content: space-between;
191
+ align-items: center;
192
+ }
193
+
194
+ .document-item:last-child {
195
+ border-bottom: none;
196
+ }
197
+
198
+ .document-name {
199
+ font-weight: bold;
200
+ color: #2c3e50;
201
+ }
202
+
203
+ .document-chunks {
204
+ color: #7f8c8d;
205
+ font-size: 0.9em;
206
+ }
207
+
208
+ .status-message {
209
+ padding: 12px;
210
+ border-radius: 4px;
211
+ margin: 10px 0;
212
+ font-weight: bold;
213
+ }
214
+
215
+ .status-success {
216
+ background: #d4edda;
217
+ color: #155724;
218
+ border: 1px solid #c3e6cb;
219
+ }
220
+
221
+ .status-error {
222
+ background: #f8d7da;
223
+ color: #721c24;
224
+ border: 1px solid #f5c6cb;
225
+ }
226
+
227
+ .loading {
228
+ display: inline-block;
229
+ width: 20px;
230
+ height: 20px;
231
+ border: 3px solid #f3f3f3;
232
+ border-top: 3px solid #3498db;
233
+ border-radius: 50%;
234
+ animation: spin 1s linear infinite;
235
+ margin-right: 10px;
236
+ }
237
+
238
+ @keyframes spin {
239
+ 0% { transform: rotate(0deg); }
240
+ 100% { transform: rotate(360deg); }
241
+ }
242
+
243
+ .hidden {
244
+ display: none;
245
+ }
246
+
247
+ .controls {
248
+ display: flex;
249
+ gap: 10px;
250
+ align-items: center;
251
+ flex-wrap: wrap;
252
+ }
253
+
254
+ .no-documents {
255
+ text-align: center;
256
+ color: #7f8c8d;
257
+ padding: 20px;
258
+ font-style: italic;
259
+ }
260
+ </style>
261
+ </head>
262
+ <body>
263
+ <div class="container">
264
+ <div class="header">
265
+ <h1>RAG PDF Question Answering System</h1>
266
+ <p>Upload documents and ask questions to get AI-powered answers</p>
267
+ </div>
268
+
269
+ <div class="main-content">
270
+ <div class="section">
271
+ <h2>📁 Upload Documents</h2>
272
+ <div class="upload-area">
273
+ <p>Select a document to upload (PDF, DOCX, TXT, CSV)</p>
274
+ <div class="file-input">
275
+ <input type="file" id="fileInput" accept=".pdf,.docx,.txt,.csv">
276
+ </div>
277
+ <div class="controls">
278
+ <button class="btn" onclick="uploadDocument()" id="uploadBtn">Upload Document</button>
279
+ <button class="btn btn-danger" onclick="clearAllDocuments()" id="clearBtn">Clear All Documents</button>
280
+ </div>
281
+ </div>
282
+ <div id="uploadStatus"></div>
283
+ </div>
284
+
285
+ <div class="section">
286
+ <h2>📚 Indexed Documents</h2>
287
+ <div id="documentsList" class="documents-list">
288
+ <div class="no-documents">No documents uploaded yet</div>
289
+ </div>
290
+ </div>
291
+
292
+ <div class="section">
293
+ <h2>❓ Ask Questions</h2>
294
+ <div class="question-area">
295
+ <textarea id="questionInput" placeholder="Enter your question about the uploaded documents..."></textarea>
296
+ <div style="margin-top: 15px;">
297
+ <button class="btn" onclick="askQuestion()" id="askBtn">Ask Question</button>
298
+ </div>
299
+ </div>
300
+ <div id="answerSection" class="answer-section hidden">
301
+ <div id="answerContent" class="answer"></div>
302
+ <div id="sourcesContent" class="sources"></div>
303
+ </div>
304
+ </div>
305
+ </div>
306
+ </div>
307
+
308
+ <script>
309
+ let isUploading = false;
310
+ let isAsking = false;
311
+
312
+ function showMessage(message, type) {
313
+ const statusDiv = document.getElementById('uploadStatus');
314
+ statusDiv.innerHTML = `<div class="status-message status-${type}">${message}</div>`;
315
+ setTimeout(() => {
316
+ statusDiv.innerHTML = '';
317
+ }, 5000);
318
+ }
319
+
320
+ function setLoadingState(isLoading, buttonId, loadingText, normalText) {
321
+ const button = document.getElementById(buttonId);
322
+ if (isLoading) {
323
+ button.innerHTML = `<span class="loading"></span>${loadingText}`;
324
+ button.disabled = true;
325
+ } else {
326
+ button.innerHTML = normalText;
327
+ button.disabled = false;
328
+ }
329
+ }
330
+
331
+ async function uploadDocument() {
332
+ const fileInput = document.getElementById('fileInput');
333
+ const file = fileInput.files[0];
334
+
335
+ if (!file) {
336
+ showMessage('Please select a file first', 'error');
337
+ return;
338
+ }
339
+
340
+ isUploading = true;
341
+ setLoadingState(true, 'uploadBtn', 'Uploading...', 'Upload Document');
342
+
343
+ const formData = new FormData();
344
+ formData.append('file', file);
345
+
346
+ try {
347
+ const response = await fetch('/upload', {
348
+ method: 'POST',
349
+ body: formData
350
+ });
351
+
352
+ const result = await response.json();
353
+
354
+ if (response.ok) {
355
+ showMessage(result.message, 'success');
356
+ fileInput.value = '';
357
+ loadDocuments();
358
+ } else {
359
+ showMessage(result.detail || 'Upload failed', 'error');
360
+ }
361
+ } catch (error) {
362
+ showMessage('Network error: ' + error.message, 'error');
363
+ } finally {
364
+ isUploading = false;
365
+ setLoadingState(false, 'uploadBtn', 'Uploading...', 'Upload Document');
366
+ }
367
+ }
368
+
369
+ async function askQuestion() {
370
+ const questionInput = document.getElementById('questionInput');
371
+ const question = questionInput.value.trim();
372
+
373
+ if (!question) {
374
+ showMessage('Please enter a question', 'error');
375
+ return;
376
+ }
377
+
378
+ isAsking = true;
379
+ setLoadingState(true, 'askBtn', 'Processing...', 'Ask Question');
380
+
381
+ try {
382
+ const response = await fetch('/ask', {
383
+ method: 'POST',
384
+ headers: {
385
+ 'Content-Type': 'application/json',
386
+ },
387
+ body: JSON.stringify({ question: question })
388
+ });
389
+
390
+ const result = await response.json();
391
+
392
+ if (response.ok) {
393
+ displayAnswer(result.answer, result.sources);
394
+ } else {
395
+ showMessage(result.detail || 'Failed to get answer', 'error');
396
+ }
397
+ } catch (error) {
398
+ showMessage('Network error: ' + error.message, 'error');
399
+ } finally {
400
+ isAsking = false;
401
+ setLoadingState(false, 'askBtn', 'Processing...', 'Ask Question');
402
+ }
403
+ }
404
+
405
+ function displayAnswer(answer, sources) {
406
+ const answerSection = document.getElementById('answerSection');
407
+ const answerContent = document.getElementById('answerContent');
408
+ const sourcesContent = document.getElementById('sourcesContent');
409
+
410
+ answerContent.innerHTML = `<strong>Answer:</strong><br>${answer}`;
411
+
412
+ if (sources && sources.length > 0) {
413
+ let sourcesHtml = '<h4>Sources:</h4>';
414
+ sources.forEach((source, index) => {
415
+ sourcesHtml += `
416
+ <div class="source-item">
417
+ <strong>${index + 1}. ${source.document}</strong>
418
+ <br><small>Similarity: ${(source.similarity * 100).toFixed(1)}%</small>
419
+ </div>
420
+ `;
421
+ });
422
+ sourcesContent.innerHTML = sourcesHtml;
423
+ } else {
424
+ sourcesContent.innerHTML = '<h4>Sources:</h4><p>No sources found</p>';
425
+ }
426
+
427
+ answerSection.classList.remove('hidden');
428
+ }
429
+
430
+ async function loadDocuments() {
431
+ try {
432
+ const response = await fetch('/documents');
433
+ const result = await response.json();
434
+
435
+ const documentsList = document.getElementById('documentsList');
436
+
437
+ if (result.documents && result.documents.length > 0) {
438
+ let html = '';
439
+ result.documents.forEach(doc => {
440
+ html += `
441
+ <div class="document-item">
442
+ <div>
443
+ <div class="document-name">${doc.title}</div>
444
+ <div class="document-chunks">${doc.chunk_count} chunks</div>
445
+ </div>
446
+ </div>
447
+ `;
448
+ });
449
+ documentsList.innerHTML = html;
450
+ } else {
451
+ documentsList.innerHTML = '<div class="no-documents">No documents uploaded yet</div>';
452
+ }
453
+ } catch (error) {
454
+ console.error('Error loading documents:', error);
455
+ }
456
+ }
457
+
458
+ async function clearAllDocuments() {
459
+ if (!confirm('Are you sure you want to clear all documents? This action cannot be undone.')) {
460
+ return;
461
+ }
462
+
463
+ setLoadingState(true, 'clearBtn', 'Clearing...', 'Clear All Documents');
464
+
465
+ try {
466
+ const response = await fetch('/clear', {
467
+ method: 'DELETE'
468
+ });
469
+
470
+ const result = await response.json();
471
+
472
+ if (response.ok) {
473
+ showMessage(result.message, 'success');
474
+ loadDocuments();
475
+ document.getElementById('answerSection').classList.add('hidden');
476
+ } else {
477
+ showMessage(result.detail || 'Failed to clear documents', 'error');
478
+ }
479
+ } catch (error) {
480
+ showMessage('Network error: ' + error.message, 'error');
481
+ } finally {
482
+ setLoadingState(false, 'clearBtn', 'Clearing...', 'Clear All Documents');
483
+ }
484
+ }
485
+
486
+ document.getElementById('questionInput').addEventListener('keypress', function(e) {
487
+ if (e.key === 'Enter' && e.ctrlKey) {
488
+ askQuestion();
489
+ }
490
+ });
491
+
492
+ window.onload = function() {
493
+ loadDocuments();
494
+ };
495
+ </script>
496
+ </body>
497
+ </html> -->
498
+
499
+
500
+ <!-- perfect index.html -->
501
+ <!DOCTYPE html>
502
+ <html lang="en">
503
+ <head>
504
+ <meta charset="UTF-8">
505
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
506
+ <title>Scholar's Archive - Document Intelligence System</title>
507
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" rel="stylesheet">
508
+ <style>
509
+ :root {
510
+ --primary-color: #1a365d;
511
+ --secondary-color: #2d5a87;
512
+ --accent-color: #c7955b;
513
+ --light-bg: #f8f6f0;
514
+ --cream: #faf8f2;
515
+ --text-dark: #2c3e50;
516
+ --text-muted: #718096;
517
+ --border-color: #e2d8cc;
518
+ --shadow: 0 4px 20px rgba(26, 54, 93, 0.1);
519
+ --shadow-hover: 0 8px 30px rgba(26, 54, 93, 0.15);
520
+ }
521
+
522
+ * {
523
+ margin: 0;
524
+ padding: 0;
525
+ box-sizing: border-box;
526
+ }
527
+
528
+ body {
529
+ font-family: 'Georgia', 'Times New Roman', serif;
530
+ line-height: 1.7;
531
+ background: linear-gradient(135deg, var(--light-bg) 0%, var(--cream) 100%);
532
+ color: var(--text-dark);
533
+ min-height: 100vh;
534
+ }
535
+
536
+ .container {
537
+ max-width: 1200px;
538
+ margin: 0 auto;
539
+ padding: 20px;
540
+ }
541
+
542
+ .header {
543
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
544
+ color: white;
545
+ text-align: center;
546
+ padding: 3rem 2rem;
547
+ border-radius: 15px 15px 0 0;
548
+ box-shadow: var(--shadow);
549
+ position: relative;
550
+ overflow: hidden;
551
+ }
552
+
553
+ .header::before {
554
+ content: '';
555
+ position: absolute;
556
+ top: 0;
557
+ left: 0;
558
+ right: 0;
559
+ bottom: 0;
560
+ background: url('data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><defs><pattern id="grain" patternUnits="userSpaceOnUse" width="100" height="100"><circle cx="20" cy="20" r="1" fill="rgba(255,255,255,0.05)"/><circle cx="80" cy="40" r="1" fill="rgba(255,255,255,0.03)"/><circle cx="40" cy="80" r="1" fill="rgba(255,255,255,0.04)"/></pattern></defs><rect width="100" height="100" fill="url(%23grain)"/></svg>');
561
+ }
562
+
563
+ .header-content {
564
+ position: relative;
565
+ z-index: 1;
566
+ }
567
+
568
+ .header h1 {
569
+ font-size: 2.8rem;
570
+ margin-bottom: 0.5rem;
571
+ font-weight: 300;
572
+ letter-spacing: 1px;
573
+ }
574
+
575
+ .header .subtitle {
576
+ font-size: 1.2rem;
577
+ opacity: 0.9;
578
+ font-style: italic;
579
+ margin-bottom: 1rem;
580
+ }
581
+
582
+ .header .description {
583
+ font-size: 1rem;
584
+ opacity: 0.8;
585
+ max-width: 600px;
586
+ margin: 0 auto;
587
+ }
588
+
589
+ .main-content {
590
+ background: white;
591
+ border-radius: 0 0 15px 15px;
592
+ box-shadow: var(--shadow);
593
+ overflow: hidden;
594
+ }
595
+
596
+ .section {
597
+ padding: 2.5rem;
598
+ border-bottom: 1px solid var(--border-color);
599
+ position: relative;
600
+ }
601
+
602
+ .section:last-child {
603
+ border-bottom: none;
604
+ }
605
+
606
+ .section-header {
607
+ display: flex;
608
+ align-items: center;
609
+ margin-bottom: 2rem;
610
+ padding-bottom: 1rem;
611
+ border-bottom: 2px solid var(--accent-color);
612
+ }
613
+
614
+ .section-header i {
615
+ font-size: 1.5rem;
616
+ color: var(--accent-color);
617
+ margin-right: 1rem;
618
+ }
619
+
620
+ .section-header h2 {
621
+ font-size: 1.8rem;
622
+ color: var(--primary-color);
623
+ font-weight: 400;
624
+ }
625
+
626
+ .upload-zone {
627
+ border: 2px dashed var(--border-color);
628
+ border-radius: 12px;
629
+ padding: 3rem 2rem;
630
+ text-align: center;
631
+ background: var(--cream);
632
+ transition: all 0.3s ease;
633
+ cursor: pointer;
634
+ position: relative;
635
+ overflow: hidden;
636
+ }
637
+
638
+ .upload-zone:hover {
639
+ border-color: var(--accent-color);
640
+ background: white;
641
+ transform: translateY(-2px);
642
+ box-shadow: var(--shadow-hover);
643
+ }
644
+
645
+ .upload-zone.dragover {
646
+ border-color: var(--primary-color);
647
+ background: rgba(26, 54, 93, 0.05);
648
+ }
649
+
650
+ .upload-icon {
651
+ font-size: 3rem;
652
+ color: var(--accent-color);
653
+ margin-bottom: 1rem;
654
+ }
655
+
656
+ .upload-text {
657
+ font-size: 1.1rem;
658
+ color: var(--text-muted);
659
+ margin-bottom: 1rem;
660
+ }
661
+
662
+ .file-types {
663
+ font-size: 0.9rem;
664
+ color: var(--text-muted);
665
+ font-style: italic;
666
+ }
667
+
668
+ .btn {
669
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
670
+ color: white;
671
+ border: none;
672
+ padding: 0.8rem 2rem;
673
+ border-radius: 8px;
674
+ font-family: inherit;
675
+ font-size: 1rem;
676
+ cursor: pointer;
677
+ transition: all 0.3s ease;
678
+ box-shadow: 0 2px 10px rgba(26, 54, 93, 0.2);
679
+ position: relative;
680
+ overflow: hidden;
681
+ }
682
+
683
+ .btn:hover {
684
+ transform: translateY(-2px);
685
+ box-shadow: 0 4px 20px rgba(26, 54, 93, 0.3);
686
+ }
687
+
688
+ .btn:active {
689
+ transform: translateY(0);
690
+ }
691
+
692
+ .btn-secondary {
693
+ background: linear-gradient(135deg, var(--accent-color) 0%, #d4a574 100%);
694
+ }
695
+
696
+ .btn-danger {
697
+ background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%);
698
+ }
699
+
700
+ .btn:disabled {
701
+ background: #cbd5e0;
702
+ color: #a0aec0;
703
+ cursor: not-allowed;
704
+ transform: none;
705
+ box-shadow: none;
706
+ }
707
+
708
+ .input-group {
709
+ margin-bottom: 1.5rem;
710
+ }
711
+
712
+ .form-control {
713
+ width: 100%;
714
+ padding: 1rem;
715
+ border: 2px solid var(--border-color);
716
+ border-radius: 8px;
717
+ font-family: inherit;
718
+ font-size: 1rem;
719
+ transition: border-color 0.3s ease;
720
+ background: var(--cream);
721
+ }
722
+
723
+ .form-control:focus {
724
+ outline: none;
725
+ border-color: var(--primary-color);
726
+ background: white;
727
+ box-shadow: 0 0 0 3px rgba(26, 54, 93, 0.1);
728
+ }
729
+
730
+ .question-textarea {
731
+ min-height: 120px;
732
+ resize: vertical;
733
+ }
734
+
735
+ .documents-grid {
736
+ display: grid;
737
+ gap: 1rem;
738
+ margin-top: 1rem;
739
+ }
740
+
741
+ .document-card {
742
+ background: var(--cream);
743
+ border: 1px solid var(--border-color);
744
+ border-radius: 10px;
745
+ padding: 1.5rem;
746
+ transition: all 0.3s ease;
747
+ position: relative;
748
+ }
749
+
750
/* --- Document cards --- */
.document-card:hover {
    background: white;
    box-shadow: var(--shadow);
    transform: translateY(-2px);
}

.document-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 0.5rem;
}

.document-title {
    font-weight: 600;
    color: var(--primary-color);
    font-size: 1.1rem;
}

.document-meta {
    color: var(--text-muted);
    font-size: 0.9rem;
}

/* --- Answer panel --- */
.answer-container {
    background: white;
    border: 1px solid var(--border-color);
    border-radius: 12px;
    margin-top: 1.5rem;
    overflow: hidden;
    box-shadow: var(--shadow);
}

.answer-header {
    background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
    color: white;
    padding: 1rem 1.5rem;
    font-weight: 500;
}

.answer-content {
    padding: 2rem;
    background: var(--cream);
}

.answer-text {
    font-size: 1.1rem;
    line-height: 1.8;
    margin-bottom: 2rem;
}

/* --- Cited sources --- */
.sources-section {
    background: white;
    border-top: 1px solid var(--border-color);
    padding: 1.5rem;
}

.sources-title {
    color: var(--primary-color);
    font-size: 1.2rem;
    margin-bottom: 1rem;
    display: flex;
    align-items: center;
}

.sources-title i {
    margin-right: 0.5rem;
}

.source-item {
    background: var(--cream);
    border: 1px solid var(--border-color);
    border-radius: 8px;
    padding: 1rem;
    margin-bottom: 0.8rem;
    transition: all 0.3s ease;
}

.source-item:hover {
    background: white;
    box-shadow: 0 2px 10px rgba(26, 54, 93, 0.05);
}

.source-name {
    font-weight: 600;
    color: var(--primary-color);
    margin-bottom: 0.3rem;
}

.source-similarity {
    color: var(--text-muted);
    font-size: 0.9rem;
}

/* --- Status banners (upload / error feedback) --- */
.status-message {
    padding: 1rem 1.5rem;
    border-radius: 8px;
    margin: 1rem 0;
    font-weight: 500;
    display: flex;
    align-items: center;
}

.status-message i {
    margin-right: 0.5rem;
}

.status-success {
    background: #f0fff4;
    color: #22543d;
    border: 1px solid #9ae6b4;
}

.status-error {
    background: #fed7d7;
    color: #742a2a;
    border: 1px solid #fc8181;
}

/* --- Busy spinner shown inside buttons --- */
.loading-spinner {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 2px solid rgba(255, 255, 255, 0.3);
    border-radius: 50%;
    border-top-color: white;
    animation: spin 0.8s linear infinite;
    margin-right: 0.5rem;
}

@keyframes spin {
    to { transform: rotate(360deg); }
}

/* --- Layout helpers --- */
.controls {
    display: flex;
    gap: 1rem;
    align-items: center;
    flex-wrap: wrap;
    margin-top: 1.5rem;
}

.hidden {
    display: none;
}

.empty-state {
    text-align: center;
    padding: 2rem;
    color: var(--text-muted);
    font-style: italic;
}

.empty-state i {
    font-size: 3rem;
    color: var(--accent-color);
    margin-bottom: 1rem;
    display: block;
}

/* --- Stats bar above the document grid --- */
.stats-bar {
    background: var(--cream);
    padding: 1rem 1.5rem;
    border-radius: 8px;
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 1.5rem;
    border: 1px solid var(--border-color);
}

.stat-item {
    text-align: center;
}

.stat-value {
    font-size: 1.5rem;
    font-weight: 600;
    color: var(--primary-color);
}

.stat-label {
    font-size: 0.9rem;
    color: var(--text-muted);
}

/* --- Small screens --- */
@media (max-width: 768px) {
    .container {
        padding: 10px;
    }

    .section {
        padding: 1.5rem;
    }

    .header h1 {
        font-size: 2rem;
    }

    .controls {
        flex-direction: column;
        align-items: stretch;
    }

    .btn {
        width: 100%;
    }
}
</style>
</head>
<body>
    <div class="container">
        <!-- Page banner -->
        <div class="header">
            <div class="header-content">
                <h1><i class="fas fa-university"></i> Scholar's Archive</h1>
                <p class="subtitle">Document Intelligence System</p>
                <p class="description">A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology</p>
            </div>
        </div>

        <div class="main-content">
            <!-- Upload section -->
            <div class="section">
                <div class="section-header">
                    <i class="fas fa-cloud-upload-alt"></i>
                    <h2>Document Repository</h2>
                </div>

                <div class="upload-zone" id="uploadZone">
                    <div class="upload-icon">
                        <i class="fas fa-file-upload"></i>
                    </div>
                    <div class="upload-text">
                        <strong>Drop your documents here</strong> or click to browse
                    </div>
                    <div class="file-types">
                        Supported formats: PDF, DOCX, TXT, CSV
                    </div>
                    <input type="file" id="fileInput" accept=".pdf,.docx,.txt,.csv" style="display: none;">
                </div>

                <div class="controls">
                    <button class="btn" onclick="uploadDocument()" id="uploadBtn">
                        <i class="fas fa-upload"></i> Upload Document
                    </button>
                    <button class="btn btn-danger" onclick="clearAllDocuments()" id="clearBtn">
                        <i class="fas fa-trash-alt"></i> Clear Repository
                    </button>
                </div>

                <div id="uploadStatus"></div>
            </div>

            <!-- Repository contents + stats -->
            <div class="section">
                <div class="section-header">
                    <i class="fas fa-books"></i>
                    <h2>Document Collection</h2>
                </div>

                <div class="stats-bar" id="statsBar">
                    <div class="stat-item">
                        <div class="stat-value" id="docCount">0</div>
                        <div class="stat-label">Documents</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value" id="chunkCount">0</div>
                        <div class="stat-label">Text Chunks</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value" id="pageCount">0</div>
                        <div class="stat-label">Total Pages</div>
                    </div>
                </div>

                <div id="documentsList" class="documents-grid">
                    <div class="empty-state">
                        <i class="fas fa-folder-open"></i>
                        <div>No documents in repository</div>
                    </div>
                </div>
            </div>

            <!-- Question / answer section -->
            <div class="section">
                <div class="section-header">
                    <i class="fas fa-search"></i>
                    <h2>Intelligent Inquiry</h2>
                </div>

                <div class="input-group">
                    <textarea
                        id="questionInput"
                        class="form-control question-textarea"
                        placeholder="Enter your scholarly inquiry about the uploaded documents..."
                        rows="4"
                    ></textarea>
                </div>

                <div class="controls">
                    <button class="btn" onclick="askQuestion()" id="askBtn">
                        <i class="fas fa-brain"></i> Submit Inquiry
                    </button>
                    <button class="btn btn-secondary" onclick="clearAnswer()" id="clearAnswerBtn">
                        <i class="fas fa-eraser"></i> Clear Response
                    </button>
                </div>

                <div id="answerContainer" class="answer-container hidden">
                    <div class="answer-header">
                        <i class="fas fa-lightbulb"></i> Scholarly Response
                    </div>
                    <div class="answer-content">
                        <div id="answerText" class="answer-text"></div>
                    </div>
                    <div id="sourcesSection" class="sources-section">
                        <div class="sources-title">
                            <i class="fas fa-quote-left"></i> Referenced Sources
                        </div>
                        <div id="sourcesList"></div>
                    </div>
                </div>
            </div>
        </div>
    </div>
+ <script>
1074
+ let isUploading = false;
1075
+ let isAsking = false;
1076
+
1077
+
1078
+ document.addEventListener('DOMContentLoaded', function() {
1079
+ loadDocuments();
1080
+ setupEventListeners();
1081
+ });
1082
+
1083
+ function setupEventListeners() {
1084
+ const uploadZone = document.getElementById('uploadZone');
1085
+ const fileInput = document.getElementById('fileInput');
1086
+ const questionInput = document.getElementById('questionInput');
1087
+
1088
+
1089
+ uploadZone.addEventListener('click', () => fileInput.click());
1090
+
1091
+ uploadZone.addEventListener('dragover', (e) => {
1092
+ e.preventDefault();
1093
+ uploadZone.classList.add('dragover');
1094
+ });
1095
+
1096
+ uploadZone.addEventListener('dragleave', () => {
1097
+ uploadZone.classList.remove('dragover');
1098
+ });
1099
+
1100
+ uploadZone.addEventListener('drop', (e) => {
1101
+ e.preventDefault();
1102
+ uploadZone.classList.remove('dragover');
1103
+ const files = e.dataTransfer.files;
1104
+ if (files.length > 0) {
1105
+ fileInput.files = files;
1106
+ uploadDocument();
1107
+ }
1108
+ });
1109
+
1110
+
1111
+ fileInput.addEventListener('change', uploadDocument);
1112
+
1113
+ questionInput.addEventListener('keydown', (e) => {
1114
+ if (e.key === 'Enter' && (e.ctrlKey || e.metaKey)) {
1115
+ askQuestion();
1116
+ }
1117
+ });
1118
+ }
1119
+
1120
+ function showMessage(message, type, icon = null) {
1121
+ const statusDiv = document.getElementById('uploadStatus');
1122
+ const iconHtml = icon ? `<i class="fas fa-${icon}"></i>` : '';
1123
+ statusDiv.innerHTML = `<div class="status-message status-${type}">${iconHtml}${message}</div>`;
1124
+ setTimeout(() => {
1125
+ statusDiv.innerHTML = '';
1126
+ }, 5000);
1127
+ }
1128
+
1129
+ function setLoadingState(isLoading, buttonId, loadingText, normalText, normalIcon = null) {
1130
+ const button = document.getElementById(buttonId);
1131
+ if (isLoading) {
1132
+ button.innerHTML = `<span class="loading-spinner"></span>${loadingText}`;
1133
+ button.disabled = true;
1134
+ } else {
1135
+ const iconHtml = normalIcon ? `<i class="fas fa-${normalIcon}"></i> ` : '';
1136
+ button.innerHTML = `${iconHtml}${normalText}`;
1137
+ button.disabled = false;
1138
+ }
1139
+ }
1140
+
1141
+ async function uploadDocument() {
1142
+ const fileInput = document.getElementById('fileInput');
1143
+ const file = fileInput.files[0];
1144
+
1145
+ if (!file) return;
1146
+
1147
+ isUploading = true;
1148
+ setLoadingState(true, 'uploadBtn', 'Processing Document...', 'Upload Document', 'upload');
1149
+
1150
+ const formData = new FormData();
1151
+ formData.append('file', file);
1152
+
1153
+ try {
1154
+ const response = await fetch('/upload', {
1155
+ method: 'POST',
1156
+ body: formData
1157
+ });
1158
+
1159
+ const result = await response.json();
1160
+
1161
+ if (response.ok) {
1162
+ showMessage(result.message, 'success', 'check-circle');
1163
+ fileInput.value = '';
1164
+ await loadDocuments();
1165
+ } else {
1166
+ showMessage(result.detail || 'Upload failed', 'error', 'exclamation-triangle');
1167
+ }
1168
+ } catch (error) {
1169
+ showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
1170
+ } finally {
1171
+ isUploading = false;
1172
+ setLoadingState(false, 'uploadBtn', 'Processing Document...', 'Upload Document', 'upload');
1173
+ }
1174
+ }
1175
+
1176
+ async function askQuestion() {
1177
+ const questionInput = document.getElementById('questionInput');
1178
+ const question = questionInput.value.trim();
1179
+
1180
+ if (!question) {
1181
+ showMessage('Please enter a question', 'error', 'exclamation-triangle');
1182
+ return;
1183
+ }
1184
+
1185
+ isAsking = true;
1186
+ setLoadingState(true, 'askBtn', 'Analyzing Documents...', 'Submit Inquiry', 'brain');
1187
+
1188
+ try {
1189
+ const response = await fetch('/ask', {
1190
+ method: 'POST',
1191
+ headers: {
1192
+ 'Content-Type': 'application/json',
1193
+ },
1194
+ body: JSON.stringify({ question: question })
1195
+ });
1196
+
1197
+ const result = await response.json();
1198
+
1199
+ if (response.ok) {
1200
+ displayAnswer(result.answer, result.sources);
1201
+ } else {
1202
+ showMessage(result.detail || 'Failed to get answer', 'error', 'exclamation-triangle');
1203
+ }
1204
+ } catch (error) {
1205
+ showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
1206
+ } finally {
1207
+ isAsking = false;
1208
+ setLoadingState(false, 'askBtn', 'Analyzing Documents...', 'Submit Inquiry', 'brain');
1209
+ }
1210
+ }
1211
+
1212
+ function displayAnswer(answer, sources) {
1213
+ const answerContainer = document.getElementById('answerContainer');
1214
+ const answerText = document.getElementById('answerText');
1215
+ const sourcesList = document.getElementById('sourcesList');
1216
+
1217
+ answerText.innerHTML = answer;
1218
+
1219
+ if (sources && sources.length > 0) {
1220
+ let sourcesHtml = '';
1221
+ sources.forEach((source, index) => {
1222
+ const similarity = Math.round(source.similarity * 100);
1223
+ sourcesHtml += `
1224
+ <div class="source-item">
1225
+ <div class="source-name">
1226
+ <i class="fas fa-file-alt"></i> ${source.document}
1227
+ </div>
1228
+ <div class="source-similarity">
1229
+ Relevance: ${similarity}% • Chunk ${index + 1}
1230
+ </div>
1231
+ </div>
1232
+ `;
1233
+ });
1234
+ sourcesList.innerHTML = sourcesHtml;
1235
+ } else {
1236
+ sourcesList.innerHTML = `
1237
+ <div class="empty-state">
1238
+ <i class="fas fa-search"></i>
1239
+ <div>No specific sources referenced</div>
1240
+ </div>
1241
+ `;
1242
+ }
1243
+
1244
+ answerContainer.classList.remove('hidden');
1245
+ answerContainer.scrollIntoView({ behavior: 'smooth' });
1246
+ }
1247
+
1248
+ function clearAnswer() {
1249
+ const answerContainer = document.getElementById('answerContainer');
1250
+ answerContainer.classList.add('hidden');
1251
+ document.getElementById('questionInput').value = '';
1252
+ }
1253
+
1254
+ async function loadDocuments() {
1255
+ try {
1256
+ const [docsResponse, statsResponse] = await Promise.all([
1257
+ fetch('/documents'),
1258
+ fetch('/stats')
1259
+ ]);
1260
+
1261
+ const docsResult = await docsResponse.json();
1262
+ const statsResult = await statsResponse.json();
1263
+
1264
+ updateDocumentsList(docsResult.documents || []);
1265
+ updateStats(statsResult);
1266
+ } catch (error) {
1267
+ console.error('Error loading documents:', error);
1268
+ }
1269
+ }
1270
+
1271
+ function updateDocumentsList(documents) {
1272
+ const documentsList = document.getElementById('documentsList');
1273
+
1274
+ if (documents.length === 0) {
1275
+ documentsList.innerHTML = `
1276
+ <div class="empty-state">
1277
+ <i class="fas fa-folder-open"></i>
1278
+ <div>No documents in repository</div>
1279
+ </div>
1280
+ `;
1281
+ return;
1282
+ }
1283
+
1284
+ let html = '';
1285
+ documents.forEach(doc => {
1286
+ html += `
1287
+ <div class="document-card">
1288
+ <div class="document-header">
1289
+ <div class="document-title">
1290
+ <i class="fas fa-file-alt"></i> ${doc.title}
1291
+ </div>
1292
+ </div>
1293
+ <div class="document-meta">
1294
+ <i class="fas fa-layer-group"></i> ${doc.chunk_count} chunks
1295
+ ${doc.total_pages ? ` • <i class="fas fa-file-pdf"></i> ${doc.total_pages} pages` : ''}
1296
+ </div>
1297
+ </div>
1298
+ `;
1299
+ });
1300
+ documentsList.innerHTML = html;
1301
+ }
1302
+
1303
+ function updateStats(stats) {
1304
+ document.getElementById('docCount').textContent = stats.total_documents || 0;
1305
+ document.getElementById('chunkCount').textContent = stats.total_chunks || 0;
1306
+ document.getElementById('pageCount').textContent = stats.total_pages || 0;
1307
+ }
1308
+
1309
+ async function clearAllDocuments() {
1310
+ if (!confirm('Are you sure you want to clear all documents from the repository? This action cannot be undone.')) {
1311
+ return;
1312
+ }
1313
+
1314
+ setLoadingState(true, 'clearBtn', 'Clearing Repository...', 'Clear Repository', 'trash-alt');
1315
+
1316
+ try {
1317
+ const response = await fetch('/clear', {
1318
+ method: 'DELETE'
1319
+ });
1320
+
1321
+ const result = await response.json();
1322
+
1323
+ if (response.ok) {
1324
+ showMessage(result.message, 'success', 'check-circle');
1325
+ await loadDocuments();
1326
+ clearAnswer();
1327
+ } else {
1328
+ showMessage(result.detail || 'Failed to clear documents', 'error', 'exclamation-triangle');
1329
+ }
1330
+ } catch (error) {
1331
+ showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
1332
+ } finally {
1333
+ setLoadingState(false, 'clearBtn', 'Clearing Repository...', 'Clear Repository', 'trash-alt');
1334
+ }
1335
+ }
1336
+ </script>
1337
+ </body>
1338
+ </html>