JigneshPrajapati18 committed on
Commit
3ee59ee
·
verified ·
1 Parent(s): 77dc82d

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ rag_storage/vector_store.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /app

# Build toolchain needed to compile native wheels (e.g. faiss, PyMuPDF).
# --no-install-recommends keeps the image smaller; the apt list cleanup
# must be in the same layer to actually save space.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached across
# source-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

COPY . .

# Directories the app expects at runtime (templates/static for Flask,
# uploads/documents for ingested files). One RUN = one layer.
RUN mkdir -p /app/templates /app/static /app/uploads /app/documents

EXPOSE 7860

CMD ["python", "app.py"]
RAG.py ADDED
@@ -0,0 +1,1285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # import re
3
+ # import fitz
4
+ # import nltk
5
+ # import numpy as np
6
+ # import pandas as pd
7
+ # from typing import List, Dict, Tuple, Any, Optional
8
+ # from sentence_transformers import SentenceTransformer
9
+ # from nltk.tokenize import sent_tokenize
10
+ # import logging
11
+ # import json
12
+ # from sklearn.metrics.pairwise import cosine_similarity
13
+ # import torch
14
+ # from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging as hf_logging
15
+ # from pathlib import Path
16
+ # import faiss
17
+ # from unstructured.partition.auto import partition
18
+ # import tempfile
19
+ # import pickle
20
+ # import shutil
21
+
22
+ # hf_logging.set_verbosity_error()
23
+
24
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
+ # logger = logging.getLogger(__name__)
26
+
27
+ # EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'
28
+ # GENERATIVE_MODEL_NAME = "microsoft/phi-2"
29
+ # DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
30
+ # PHI_MAX_NEW_TOKENS = 250
31
+ # PHI_TEMPERATURE = 0.3
32
+ # QUERY_SIMILARITY_THRESHOLD = 0.50
33
+ # CHUNK_SIZE = 100
34
+ # CHUNK_OVERLAP = 30
35
+ # STORAGE_DIR = "rag_storage"
36
+
37
+ # try:
38
+ # nltk.download('punkt', quiet=True)
39
+ # logger.info("NLTK punkt found or downloaded successfully")
40
+ # except Exception as e:
41
+ # logger.warning(f"Failed to download or find NLTK punkt: {e}. Using fallback tokenization.")
42
+
43
+ # def simple_sent_tokenize(text):
44
+ # sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
45
+ # return [s for s in sentences if s.strip()]
46
+
47
+ # sent_tokenize = simple_sent_tokenize
48
+
49
+ # class DocumentProcessor:
50
+ # def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME, device: str = DEVICE):
51
+ # try:
52
+ # self.embedding_model = SentenceTransformer(embedding_model_name, device=device)
53
+ # logger.info(f"Initialized embedding model: {embedding_model_name} on device: {device}")
54
+ # self.device = device
55
+ # self.vector_store = None
56
+ # self.chunks = []
57
+ # self.doc_metadata = []
58
+ # self.storage_dir = STORAGE_DIR
59
+ # os.makedirs(self.storage_dir, exist_ok=True)
60
+ # except Exception as e:
61
+ # logger.error(f"Failed to load embedding model {embedding_model_name}: {e}")
62
+ # raise
63
+
64
+ # def save_state(self):
65
+ # """Save the current state to disk"""
66
+ # try:
67
+ # # Save FAISS index if it exists
68
+ # if self.vector_store is not None:
69
+ # faiss.write_index(self.vector_store, os.path.join(self.storage_dir, "vector_store.faiss"))
70
+
71
+ # # Save chunks and metadata
72
+ # state = {
73
+ # "chunks": self.chunks,
74
+ # "doc_metadata": self.doc_metadata
75
+ # }
76
+
77
+ # with open(os.path.join(self.storage_dir, "metadata.pkl"), "wb") as f:
78
+ # pickle.dump(state, f)
79
+
80
+ # logger.info("Successfully saved document processor state")
81
+ # return True
82
+ # except Exception as e:
83
+ # logger.error(f"Failed to save state: {e}")
84
+ # return False
85
+
86
+ # def load_state(self) -> bool:
87
+ # """Load state from disk if available"""
88
+ # try:
89
+ # faiss_path = os.path.join(self.storage_dir, "vector_store.faiss")
90
+ # metadata_path = os.path.join(self.storage_dir, "metadata.pkl")
91
+
92
+ # if os.path.exists(faiss_path) and os.path.exists(metadata_path):
93
+ # # Load FAISS index
94
+ # self.vector_store = faiss.read_index(faiss_path)
95
+
96
+ # # Load metadata and chunks
97
+ # with open(metadata_path, "rb") as f:
98
+ # state = pickle.load(f)
99
+ # self.chunks = state["chunks"]
100
+ # self.doc_metadata = state["doc_metadata"]
101
+
102
+ # logger.info(f"Successfully loaded state with {len(self.chunks)} chunks and {len(self.doc_metadata)} documents")
103
+ # return True
104
+ # else:
105
+ # logger.info("No saved state found - starting fresh")
106
+ # return False
107
+ # except Exception as e:
108
+ # logger.error(f"Failed to load state: {e}")
109
+ # return False
110
+
111
+ # def clear_state(self) -> bool:
112
+ # """Clear all stored data"""
113
+ # try:
114
+ # if os.path.exists(self.storage_dir):
115
+ # shutil.rmtree(self.storage_dir)
116
+ # os.makedirs(self.storage_dir, exist_ok=True)
117
+
118
+ # self.vector_store = None
119
+ # self.chunks = []
120
+ # self.doc_metadata = []
121
+
122
+ # logger.info("Successfully cleared all stored data")
123
+ # return True
124
+ # except Exception as e:
125
+ # logger.error(f"Failed to clear state: {e}")
126
+ # return False
127
+
128
+ # def _process_file(self, file_path: str) -> Tuple[str, str]:
129
+ # """Process different file types and extract text"""
130
+ # try:
131
+ # # Try unstructured first
132
+ # try:
133
+ # elements = partition(filename=file_path)
134
+ # text = "\n\n".join([str(el) for el in elements])
135
+ # title = Path(file_path).stem
136
+ # return text, title
137
+ # except ImportError:
138
+ # # Fallback to PyMuPDF for PDFs
139
+ # if file_path.lower().endswith('.pdf'):
140
+ # doc = fitz.open(file_path)
141
+ # text = ""
142
+ # for page in doc:
143
+ # text += page.get_text() + "\n\n"
144
+ # doc.close()
145
+ # title = Path(file_path).stem
146
+ # return text, title
147
+ # else:
148
+ # raise
149
+ # except Exception as e:
150
+ # logger.error(f"Error processing file {file_path}: {e}")
151
+ # return "", Path(file_path).stem
152
+
153
+ # def chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
154
+ # """Split text into chunks with overlap using sentence boundaries"""
155
+ # if not text:
156
+ # return []
157
+
158
+ # try:
159
+ # sentences = sent_tokenize(text)
160
+ # except Exception as e:
161
+ # logger.error(f"Sentence tokenization failed: {e}. Using simple split.")
162
+ # sentences = re.split(r'[\n\.\?\!]+', text)
163
+ # sentences = [s.strip() for s in sentences if s.strip()]
164
+
165
+ # if not sentences:
166
+ # logger.warning("No sentences found after tokenization.")
167
+ # return [text] if len(text) <= chunk_size else [text[i:i+chunk_size] for i in range(0, len(text), chunk_size-overlap)]
168
+
169
+ # chunks = []
170
+ # current_chunk = []
171
+ # current_length = 0
172
+
173
+ # for sentence in sentences:
174
+ # sentence_len = len(sentence)
175
+ # if current_length + sentence_len > chunk_size:
176
+ # if current_chunk:
177
+ # chunks.append(" ".join(current_chunk))
178
+ # current_chunk = current_chunk[-max(1, len(current_chunk)*overlap//chunk_size):] # Keep overlap
179
+ # current_length = sum(len(s) for s in current_chunk)
180
+
181
+ # if sentence_len <= chunk_size:
182
+ # current_chunk.append(sentence)
183
+ # current_length += sentence_len
184
+ # else:
185
+ # logger.warning(f"Sentence length ({sentence_len}) exceeds chunk size ({chunk_size}). Adding as its own chunk.")
186
+ # chunks.append(sentence)
187
+ # else:
188
+ # current_chunk.append(sentence)
189
+ # current_length += sentence_len
190
+
191
+ # if current_chunk:
192
+ # chunks.append(" ".join(current_chunk))
193
+
194
+ # chunks = [c for c in chunks if c.strip()]
195
+ # logger.info(f"Split text into {len(chunks)} chunks.")
196
+ # return chunks
197
+
198
+ # def generate_embedding(self, text: str) -> Optional[np.ndarray]:
199
+ # """Generate embedding for a single text chunk"""
200
+ # if not text or not isinstance(text, str):
201
+ # logger.warning("generate_embedding called with invalid text.")
202
+ # return None
203
+ # try:
204
+ # self.embedding_model.to(self.device)
205
+ # embedding = self.embedding_model.encode(text, convert_to_numpy=True, show_progress_bar=False)
206
+ # return embedding.astype(np.float32)
207
+ # except Exception as e:
208
+ # logger.error(f"Error generating embedding: {e}")
209
+ # return None
210
+
211
+ # def add_document(self, file_path: str) -> bool:
212
+ # """Process and add a document to the vector store"""
213
+ # logger.info(f"Processing document: {file_path}")
214
+
215
+ # try:
216
+ # # Check if document already exists
217
+ # for doc in self.doc_metadata:
218
+ # if os.path.normpath(doc["path"]) == os.path.normpath(file_path):
219
+ # logger.info(f"Document '{doc['title']}' already exists in the index - skipping")
220
+ # return True
221
+
222
+ # text, title = self._process_file(file_path)
223
+ # if not text:
224
+ # logger.warning(f"No text extracted from {file_path}")
225
+ # return False
226
+
227
+ # chunks = self.chunk_text(text)
228
+ # if not chunks:
229
+ # logger.warning(f"No chunks created for {file_path}")
230
+ # return False
231
+
232
+ # # Generate embeddings for all chunks
233
+ # embeddings = []
234
+ # valid_chunks = []
235
+ # for i, chunk in enumerate(chunks):
236
+ # emb = self.generate_embedding(chunk)
237
+ # if emb is not None:
238
+ # embeddings.append(emb)
239
+ # valid_chunks.append({
240
+ # "text": chunk,
241
+ # "doc_title": title,
242
+ # "doc_path": file_path,
243
+ # "chunk_index": i
244
+ # })
245
+
246
+ # if not embeddings:
247
+ # logger.warning(f"No valid embeddings generated for {file_path}")
248
+ # return False
249
+
250
+ # embeddings = np.array(embeddings)
251
+
252
+ # # Initialize or update FAISS index
253
+ # if self.vector_store is None:
254
+ # self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
255
+ # self.vector_store.add(embeddings)
256
+ # else:
257
+ # self.vector_store.add(embeddings)
258
+
259
+ # # Store metadata
260
+ # start_idx = len(self.chunks)
261
+ # self.chunks.extend(valid_chunks)
262
+
263
+ # self.doc_metadata.append({
264
+ # "title": title,
265
+ # "path": file_path,
266
+ # "chunk_count": len(valid_chunks),
267
+ # "start_idx": start_idx,
268
+ # "end_idx": start_idx + len(valid_chunks) - 1
269
+ # })
270
+
271
+ # # Save state after each document addition
272
+ # self.save_state()
273
+
274
+ # logger.info(f"Successfully added document '{title}' with {len(valid_chunks)} chunks")
275
+ # return True
276
+
277
+ # except Exception as e:
278
+ # logger.error(f"Failed to process document {file_path}: {e}")
279
+ # return False
280
+
281
+ # def search_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
282
+ # """Search for relevant chunks using semantic similarity"""
283
+ # if self.vector_store is None or not self.chunks:
284
+ # logger.warning("No documents have been indexed yet")
285
+ # return []
286
+
287
+ # query_embedding = self.generate_embedding(query)
288
+ # if query_embedding is None:
289
+ # logger.error("Failed to generate embedding for the query")
290
+ # return []
291
+
292
+ # query_embedding = np.array([query_embedding]) # Convert to 2D array
293
+
294
+ # # Search FAISS index
295
+ # distances, indices = self.vector_store.search(query_embedding, top_k)
296
+
297
+ # # Convert to similarity scores (FAISS returns squared L2 distances)
298
+ # similarities = 1 / (1 + distances[0])
299
+
300
+ # results = []
301
+ # for idx, sim in zip(indices[0], similarities):
302
+ # if idx < 0 or idx >= len(self.chunks): # Invalid index
303
+ # continue
304
+
305
+ # chunk_data = self.chunks[idx]
306
+ # results.append({
307
+ # "text": chunk_data["text"],
308
+ # "similarity": float(sim),
309
+ # "doc_title": chunk_data["doc_title"],
310
+ # "doc_path": chunk_data["doc_path"],
311
+ # "chunk_index": chunk_data["chunk_index"]
312
+ # })
313
+
314
+ # # Sort by similarity (highest first)
315
+ # results.sort(key=lambda x: x["similarity"], reverse=True)
316
+
317
+ # # Apply threshold
318
+ # results = [r for r in results if r["similarity"] >= QUERY_SIMILARITY_THRESHOLD]
319
+
320
+ # if not results and top_k > 0:
321
+ # logger.info("No chunks met similarity threshold, returning top result anyway")
322
+ # return results[:1]
323
+
324
+ # return results
325
+
326
+ # class RAGSystem:
327
+ # def __init__(self):
328
+ # logger.info("Initializing RAG System...")
329
+ # try:
330
+ # self.doc_processor = DocumentProcessor(embedding_model_name=EMBEDDING_MODEL_NAME, device=DEVICE)
331
+
332
+ # # Try to load existing state
333
+ # if self.doc_processor.load_state():
334
+ # logger.info("Successfully loaded existing document index")
335
+ # else:
336
+ # logger.info("Starting with a fresh document index")
337
+
338
+ # logger.info(f"Loading Generative LLM: {GENERATIVE_MODEL_NAME} on {DEVICE}...")
339
+ # try:
340
+ # phi_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
341
+ # model_kwargs = {"trust_remote_code": True}
342
+ # if DEVICE == 'cuda':
343
+ # if torch.cuda.is_bf16_supported():
344
+ # logger.info("Using bfloat16 for Phi-2 model.")
345
+ # model_kwargs["torch_dtype"] = torch.bfloat16
346
+ # else:
347
+ # logger.info("Using float16 for Phi-2 model.")
348
+ # model_kwargs["torch_dtype"] = torch.float16
349
+ # else:
350
+ # logger.info("Using float32 for Phi-2 model on CPU.")
351
+ # model_kwargs["torch_dtype"] = torch.float32
352
+
353
+ # phi_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, **model_kwargs)
354
+ # phi_model = phi_model.to(DEVICE)
355
+
356
+ # pipeline_device_index = 0 if DEVICE == "cuda" else -1
357
+ # self.phi_pipe = pipeline(
358
+ # "text-generation",
359
+ # model=phi_model,
360
+ # tokenizer=phi_tokenizer,
361
+ # device=pipeline_device_index
362
+ # )
363
+ # logger.info(f"✅ Generative LLM ({GENERATIVE_MODEL_NAME}) loaded successfully on {DEVICE}.")
364
+ # except Exception as e:
365
+ # logger.error(f"❌ Critical Error loading Phi-2 model: {e}")
366
+ # logger.error("RAG Q&A functionality will be disabled.")
367
+ # self.phi_pipe = None
368
+
369
+ # logger.info("✅ RAG System initialized successfully.")
370
+
371
+ # except Exception as e:
372
+ # logger.critical(f"Failed to initialize RAG System: {e}", exc_info=True)
373
+ # raise RuntimeError("System initialization failed.") from e
374
+
375
+ # def add_document(self, file_path: str) -> bool:
376
+ # """Add a document to the system"""
377
+ # return self.doc_processor.add_document(file_path)
378
+
379
+ # def ask_question(self, question: str, top_k: int = 3) -> Dict[str, Any]:
380
+ # """Answer a question using RAG"""
381
+ # if self.phi_pipe is None:
382
+ # return {
383
+ # "answer": "Error: The AI model is not available. Please check the logs.",
384
+ # "sources": []
385
+ # }
386
+
387
+ # logger.info(f"Processing question: '{question[:100]}...'")
388
+
389
+ # # Step 1: Retrieve relevant chunks
390
+ # relevant_chunks = self.doc_processor.search_chunks(question, top_k)
391
+ # if not relevant_chunks:
392
+ # return {
393
+ # "answer": "No relevant information found in documents to answer this question.",
394
+ # "sources": []
395
+ # }
396
+
397
+ # # Step 2: Prepare context for generation
398
+ # context = "\n\n---\n\n".join([
399
+ # f"Document: {chunk['doc_title']}\nChunk {chunk['chunk_index']} (Similarity: {chunk['similarity']:.2f})\n\n{chunk['text']}"
400
+ # for chunk in relevant_chunks
401
+ # ])
402
+
403
+ # # Step 3: Generate answer with Phi-2
404
+ # prompt = f"""You are a helpful assistant.Answer the question ONLY from the provided context.If the context is insufficient, just say you don't know.
405
+
406
+ # Context:
407
+ # {context}
408
+
409
+ # Question: {question}
410
+
411
+ # Answer: """
412
+
413
+ # try:
414
+ # output = self.phi_pipe(
415
+ # prompt,
416
+ # max_new_tokens=PHI_MAX_NEW_TOKENS,
417
+ # temperature=PHI_TEMPERATURE,
418
+ # do_sample=True,
419
+ # return_full_text=False,
420
+ # pad_token_id=self.phi_pipe.tokenizer.eos_token_id
421
+ # )
422
+
423
+ # generated_text = output[0]["generated_text"].strip()
424
+
425
+ # # Post-processing to clean up the response
426
+ # if "Question:" in generated_text:
427
+ # generated_text = generated_text.split("Question:")[0].strip()
428
+
429
+ # # Extract sources
430
+ # sources = []
431
+ # seen_docs = set()
432
+ # for chunk in relevant_chunks:
433
+ # if chunk['doc_title'] not in seen_docs:
434
+ # sources.append({
435
+ # "document": chunk['doc_title'],
436
+ # "path": chunk['doc_path'],
437
+ # "similarity": chunk['similarity']
438
+ # })
439
+ # seen_docs.add(chunk['doc_title'])
440
+
441
+ # return {
442
+ # "answer": generated_text,
443
+ # "sources": sources,
444
+ # "relevant_chunks": relevant_chunks # For debugging/explanation
445
+ # }
446
+
447
+ # except Exception as e:
448
+ # logger.error(f"Error generating answer: {e}")
449
+ # return {
450
+ # "answer": f"Error generating answer: {str(e)}",
451
+ # "sources": []
452
+ # }
453
+
454
+ # def explain_retrieval(self, question: str):
455
+ # """Explain the retrieval process for educational purposes"""
456
+ # print("\n=== RAG Process Explanation ===")
457
+ # print(f"Question: {question}")
458
+
459
+ # # Step 1: Show query embedding
460
+ # print("\n1. Query Embedding:")
461
+ # query_embedding = self.doc_processor.generate_embedding(question)
462
+ # if query_embedding is not None:
463
+ # print(f"- Generated {len(query_embedding)}-dimensional embedding vector")
464
+ # print(f"- Sample values: {query_embedding[:5]}...")
465
+ # else:
466
+ # print("Failed to generate query embedding")
467
+ # return
468
+
469
+ # # Step 2: Show retrieval
470
+ # print("\n2. Document Chunk Retrieval:")
471
+ # chunks = self.doc_processor.search_chunks(question, top_k=3)
472
+ # if not chunks:
473
+ # print("No relevant chunks found")
474
+ # return
475
+
476
+ # print(f"Found {len(chunks)} relevant chunks:")
477
+ # for i, chunk in enumerate(chunks, 1):
478
+ # print(f"\nChunk {i}:")
479
+ # print(f"- Source: {chunk['doc_title']}")
480
+ # print(f"- Chunk Index: {chunk['chunk_index']}")
481
+ # print(f"- Similarity Score: {chunk['similarity']:.4f}")
482
+ # print(f"- Text Preview: {chunk['text'][:150]}...")
483
+
484
+ # # Step 3: Show context preparation
485
+ # print("\n3. Context Preparation:")
486
+ # print("The top chunks are combined into a context that will be sent to the LLM")
487
+
488
+ # # Step 4: Show generation
489
+ # print("\n4. Generation with Phi-2:")
490
+ # print("The LLM is prompted to answer the question using ONLY the provided context")
491
+ # print("This helps prevent hallucination by grounding the response in the retrieved documents")
492
+
493
+ # # Show actual answer
494
+ # result = self.ask_question(question)
495
+ # print("\nFinal Answer:")
496
+ # print(result['answer'])
497
+
498
+ # print("\nSources:")
499
+ # for source in result['sources']:
500
+ # print(f"- {source['document']} (similarity: {source['similarity']:.2f})")
501
+
502
+ # def list_documents(self) -> List[Dict[str, Any]]:
503
+ # """List all indexed documents"""
504
+ # return [{
505
+ # "title": doc["title"],
506
+ # "path": doc["path"],
507
+ # "chunk_count": doc["chunk_count"]
508
+ # } for doc in self.doc_processor.doc_metadata]
509
+
510
+ # def clear_index(self) -> bool:
511
+ # """Clear all indexed documents"""
512
+ # return self.doc_processor.clear_state()
513
+
514
+ # def close(self):
515
+ # """Clean up resources"""
516
+ # logger.info("Shutting down RAG System...")
517
+ # # Save state before closing
518
+ # self.doc_processor.save_state()
519
+
520
+ # if hasattr(self, 'phi_pipe') and self.phi_pipe:
521
+ # del self.phi_pipe
522
+ # if hasattr(self.doc_processor, 'embedding_model'):
523
+ # del self.doc_processor.embedding_model
524
+ # if DEVICE == 'cuda':
525
+ # torch.cuda.empty_cache()
526
+ # logger.info("Cleared CUDA cache.")
527
+ # logger.info("RAG System shut down.")
528
+
529
+ # def main():
530
+ # rag_system = RAGSystem()
531
+
532
+ # while True:
533
+ # print("\n1. Add Document")
534
+ # print("2. Ask Question")
535
+ # print("3. Explain Retrieval Process")
536
+ # print("4. List Indexed Documents")
537
+ # print("5. Clear All Documents")
538
+ # print("6. Exit")
539
+
540
+ # choice = input("Enter your choice: ")
541
+
542
+ # if choice == "1":
543
+ # file_path = input("Enter document path (CSV, DOCX, PDF, etc.): ").strip('"')
544
+ # if not os.path.exists(file_path):
545
+ # print("File not found!")
546
+ # continue
547
+
548
+ # if rag_system.add_document(file_path):
549
+ # print("Document added successfully!")
550
+ # else:
551
+ # print("Failed to add document")
552
+
553
+ # elif choice == "2":
554
+ # question = input("Enter your question: ")
555
+ # result = rag_system.ask_question(question)
556
+ # print("\nAnswer:", result["answer"])
557
+ # if result["sources"]:
558
+ # print("\nSources:")
559
+ # for src in result["sources"]:
560
+ # print(f"- {src['document']} (similarity: {src['similarity']:.2f})")
561
+ # else:
562
+ # print("(No sources cited)")
563
+
564
+ # elif choice == "3":
565
+ # question = input("Enter a question to explain the retrieval process: ")
566
+ # rag_system.explain_retrieval(question)
567
+
568
+ # elif choice == "4":
569
+ # docs = rag_system.list_documents()
570
+ # if docs:
571
+ # print("\nIndexed Documents:")
572
+ # for i, doc in enumerate(docs, 1):
573
+ # print(f"{i}. {doc['title']} ({doc['chunk_count']} chunks)")
574
+ # print(f" Path: {doc['path']}")
575
+ # else:
576
+ # print("No documents indexed yet")
577
+
578
+ # elif choice == "5":
579
+ # confirm = input("Are you sure you want to clear ALL documents? (y/n): ")
580
+ # if confirm.lower() == 'y':
581
+ # if rag_system.clear_index():
582
+ # print("All documents cleared")
583
+ # else:
584
+ # print("Failed to clear documents")
585
+
586
+ # elif choice == "6":
587
+ # rag_system.close()
588
+ # break
589
+
590
+ # else:
591
+ # print("Invalid choice")
592
+
593
+ # if __name__ == "__main__":
594
+ # main()
595
+
596
+
597
+
598
+
599
+
600
+
601
+
602
+
603
+
604
+ import os
605
+ import re
606
+ import fitz
607
+ import nltk
608
+ import numpy as np
609
+ import pandas as pd
610
+ from typing import List, Dict, Tuple, Any, Optional
611
+ from sentence_transformers import SentenceTransformer
612
+ from nltk.tokenize import sent_tokenize
613
+ import logging
614
+ import json
615
+ from sklearn.metrics.pairwise import cosine_similarity
616
+ import torch
617
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging as hf_logging
618
+ from pathlib import Path
619
+ import faiss
620
+ from unstructured.partition.auto import partition
621
+ import tempfile
622
+ import pickle
623
+ import shutil
624
+
625
# Silence transformers' verbose progress/warning output before model loads.
hf_logging.set_verbosity_error()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Model and retrieval configuration -------------------------------------
EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'
GENERATIVE_MODEL_NAME = "microsoft/phi-2"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
PHI_MAX_NEW_TOKENS = 250            # generation budget per answer
PHI_TEMPERATURE = 0.3               # low temperature: mostly deterministic output
QUERY_SIMILARITY_THRESHOLD = 0.50   # minimum similarity for a chunk to be cited
CHUNK_SIZE = 100                    # approximate characters per chunk
CHUNK_OVERLAP = 30                  # characters of overlap between chunks
STORAGE_DIR = "rag_storage"         # on-disk home of the FAISS index + metadata

# Best-effort download of NLTK's punkt data; failure is non-fatal because a
# regex-based fallback tokenizer is installed below.
try:
    nltk.download('punkt', quiet=True)
    logger.info("NLTK punkt found or downloaded successfully")
except Exception as e:
    logger.warning(f"Failed to download or find NLTK punkt: {e}. Using fallback tokenization.")
645
+
646
def simple_sent_tokenize(text):
    """Regex-based sentence splitter used as a fallback for NLTK.

    Splits on whitespace that follows '.', '?' or '!', with lookbehinds that
    avoid breaking on common abbreviation shapes; empty pieces are dropped.
    """
    parts = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    return [p for p in parts if p.strip()]

# NOTE(review): this rebinding replaces nltk's sent_tokenize with the regex
# fallback unconditionally, even when punkt downloaded successfully above —
# confirm whether that is intentional.
sent_tokenize = simple_sent_tokenize
651
+
652
+ class DocumentProcessor:
653
+ def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME, device: str = DEVICE):
654
+ try:
655
+ self.embedding_model = SentenceTransformer(embedding_model_name, device=device)
656
+ logger.info(f"Initialized embedding model: {embedding_model_name} on device: {device}")
657
+ self.device = device
658
+ self.vector_store = None
659
+ self.chunks = []
660
+ self.doc_metadata = []
661
+ self.storage_dir = STORAGE_DIR
662
+ os.makedirs(self.storage_dir, exist_ok=True)
663
+ except Exception as e:
664
+ logger.error(f"Failed to load embedding model {embedding_model_name}: {e}")
665
+ raise
666
+
667
+ def save_state(self):
668
+ """Save the current state to disk"""
669
+ try:
670
+ # Save FAISS index if it exists
671
+ if self.vector_store is not None:
672
+ faiss.write_index(self.vector_store, os.path.join(self.storage_dir, "vector_store.faiss"))
673
+
674
+ # Save chunks and metadata
675
+ state = {
676
+ "chunks": self.chunks,
677
+ "doc_metadata": self.doc_metadata
678
+ }
679
+
680
+ with open(os.path.join(self.storage_dir, "metadata.pkl"), "wb") as f:
681
+ pickle.dump(state, f)
682
+
683
+ logger.info("Successfully saved document processor state")
684
+ return True
685
+ except Exception as e:
686
+ logger.error(f"Failed to save state: {e}")
687
+ return False
688
+
689
+ def load_state(self) -> bool:
690
+ """Load state from disk if available"""
691
+ try:
692
+ faiss_path = os.path.join(self.storage_dir, "vector_store.faiss")
693
+ metadata_path = os.path.join(self.storage_dir, "metadata.pkl")
694
+
695
+ if os.path.exists(faiss_path) and os.path.exists(metadata_path):
696
+ # Load FAISS index
697
+ self.vector_store = faiss.read_index(faiss_path)
698
+
699
+ # Load metadata and chunks
700
+ with open(metadata_path, "rb") as f:
701
+ state = pickle.load(f)
702
+ self.chunks = state["chunks"]
703
+ self.doc_metadata = state["doc_metadata"]
704
+
705
+ logger.info(f"Successfully loaded state with {len(self.chunks)} chunks and {len(self.doc_metadata)} documents")
706
+ return True
707
+ else:
708
+ logger.info("No saved state found - starting fresh")
709
+ return False
710
+ except Exception as e:
711
+ logger.error(f"Failed to load state: {e}")
712
+ return False
713
+
714
+ def clear_state(self) -> bool:
715
+ """Clear all stored data"""
716
+ try:
717
+ if os.path.exists(self.storage_dir):
718
+ shutil.rmtree(self.storage_dir)
719
+ os.makedirs(self.storage_dir, exist_ok=True)
720
+
721
+ self.vector_store = None
722
+ self.chunks = []
723
+ self.doc_metadata = []
724
+
725
+ logger.info("Successfully cleared all stored data")
726
+ return True
727
+ except Exception as e:
728
+ logger.error(f"Failed to clear state: {e}")
729
+ return False
730
+
731
+ def _extract_pdf_pages(self, file_path: str) -> List[Dict[str, Any]]:
732
+ """Extract text from PDF with page numbers"""
733
+ pages = []
734
+ try:
735
+ doc = fitz.open(file_path)
736
+ for page_num in range(len(doc)):
737
+ page = doc.load_page(page_num)
738
+ text = page.get_text()
739
+ if text.strip(): # Only include pages with content
740
+ pages.append({
741
+ "page_number": page_num + 1,
742
+ "text": text
743
+ })
744
+ doc.close()
745
+ logger.info(f"Extracted {len(pages)} pages from PDF")
746
+ return pages
747
+ except Exception as e:
748
+ logger.error(f"Error extracting PDF pages: {e}")
749
+ return []
750
+
751
+ def _process_file(self, file_path: str) -> Tuple[str, str, List[Dict[str, Any]]]:
752
+ """Process different file types and extract text with page information"""
753
+ try:
754
+ title = Path(file_path).stem
755
+ pages = []
756
+
757
+ # Handle PDF files specially to extract page numbers
758
+ if file_path.lower().endswith('.pdf'):
759
+ pages = self._extract_pdf_pages(file_path)
760
+ text = "\n\n".join([page["text"] for page in pages])
761
+ return text, title, pages
762
+ else:
763
+ # For non-PDF files, try unstructured first
764
+ try:
765
+ elements = partition(filename=file_path)
766
+ text = "\n\n".join([str(el) for el in elements])
767
+ # For non-PDF files, create a single "page"
768
+ pages = [{"page_number": 1, "text": text}]
769
+ return text, title, pages
770
+ except ImportError:
771
+ # Fallback for text files
772
+ if file_path.lower().endswith(('.txt', '.csv')):
773
+ with open(file_path, 'r', encoding='utf-8') as f:
774
+ text = f.read()
775
+ pages = [{"page_number": 1, "text": text}]
776
+ return text, title, pages
777
+ else:
778
+ raise
779
+ except Exception as e:
780
+ logger.error(f"Error processing file {file_path}: {e}")
781
+ return "", Path(file_path).stem, []
782
+
783
+ def _find_chunk_page(self, chunk_text: str, pages: List[Dict[str, Any]]) -> int:
784
+ """Find which page a chunk belongs to"""
785
+ chunk_words = set(chunk_text.lower().split()[:10]) # Use first 10 words for matching
786
+
787
+ best_page = 1
788
+ best_score = 0
789
+
790
+ for page in pages:
791
+ page_words = set(page["text"].lower().split())
792
+ common_words = chunk_words.intersection(page_words)
793
+ score = len(common_words) / len(chunk_words) if chunk_words else 0
794
+
795
+ if score > best_score:
796
+ best_score = score
797
+ best_page = page["page_number"]
798
+
799
+ return best_page
800
+
801
+ def chunk_text(self, text: str, pages: List[Dict[str, Any]], chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Dict[str, Any]]:
802
+ """Split text into chunks with overlap using sentence boundaries and track page numbers"""
803
+ if not text:
804
+ return []
805
+
806
+ try:
807
+ sentences = sent_tokenize(text)
808
+ except Exception as e:
809
+ logger.error(f"Sentence tokenization failed: {e}. Using simple split.")
810
+ sentences = re.split(r'[\n\.\?\!]+', text)
811
+ sentences = [s.strip() for s in sentences if s.strip()]
812
+
813
+ if not sentences:
814
+ logger.warning("No sentences found after tokenization.")
815
+ return [{"text": text, "page_number": 1}] if len(text) <= chunk_size else [{"text": text[i:i+chunk_size], "page_number": 1} for i in range(0, len(text), chunk_size-overlap)]
816
+
817
+ chunks = []
818
+ current_chunk = []
819
+ current_length = 0
820
+
821
+ for sentence in sentences:
822
+ sentence_len = len(sentence)
823
+ if current_length + sentence_len > chunk_size:
824
+ if current_chunk:
825
+ chunk_text = " ".join(current_chunk)
826
+ page_number = self._find_chunk_page(chunk_text, pages)
827
+ chunks.append({
828
+ "text": chunk_text,
829
+ "page_number": page_number
830
+ })
831
+ current_chunk = current_chunk[-max(1, len(current_chunk)*overlap//chunk_size):] # Keep overlap
832
+ current_length = sum(len(s) for s in current_chunk)
833
+
834
+ if sentence_len <= chunk_size:
835
+ current_chunk.append(sentence)
836
+ current_length += sentence_len
837
+ else:
838
+ logger.warning(f"Sentence length ({sentence_len}) exceeds chunk size ({chunk_size}). Adding as its own chunk.")
839
+ page_number = self._find_chunk_page(sentence, pages)
840
+ chunks.append({
841
+ "text": sentence,
842
+ "page_number": page_number
843
+ })
844
+ else:
845
+ current_chunk.append(sentence)
846
+ current_length += sentence_len
847
+
848
+ if current_chunk:
849
+ chunk_text = " ".join(current_chunk)
850
+ page_number = self._find_chunk_page(chunk_text, pages)
851
+ chunks.append({
852
+ "text": chunk_text,
853
+ "page_number": page_number
854
+ })
855
+
856
+ chunks = [c for c in chunks if c["text"].strip()]
857
+ logger.info(f"Split text into {len(chunks)} chunks with page numbers.")
858
+ return chunks
859
+
860
+ def generate_embedding(self, text: str) -> Optional[np.ndarray]:
861
+ """Generate embedding for a single text chunk"""
862
+ if not text or not isinstance(text, str):
863
+ logger.warning("generate_embedding called with invalid text.")
864
+ return None
865
+ try:
866
+ self.embedding_model.to(self.device)
867
+ embedding = self.embedding_model.encode(text, convert_to_numpy=True, show_progress_bar=False)
868
+ return embedding.astype(np.float32)
869
+ except Exception as e:
870
+ logger.error(f"Error generating embedding: {e}")
871
+ return None
872
+
873
+ def add_document(self, file_path: str) -> bool:
874
+ """Process and add a document to the vector store"""
875
+ logger.info(f"Processing document: {file_path}")
876
+
877
+ try:
878
+ # Check if document already exists
879
+ for doc in self.doc_metadata:
880
+ if os.path.normpath(doc["path"]) == os.path.normpath(file_path):
881
+ logger.info(f"Document '{doc['title']}' already exists in the index - skipping")
882
+ return True
883
+
884
+ text, title, pages = self._process_file(file_path)
885
+ if not text:
886
+ logger.warning(f"No text extracted from {file_path}")
887
+ return False
888
+
889
+ chunks = self.chunk_text(text, pages)
890
+ if not chunks:
891
+ logger.warning(f"No chunks created for {file_path}")
892
+ return False
893
+
894
+ # Generate embeddings for all chunks
895
+ embeddings = []
896
+ valid_chunks = []
897
+ for i, chunk_data in enumerate(chunks):
898
+ emb = self.generate_embedding(chunk_data["text"])
899
+ if emb is not None:
900
+ embeddings.append(emb)
901
+ valid_chunks.append({
902
+ "text": chunk_data["text"],
903
+ "page_number": chunk_data["page_number"],
904
+ "doc_title": title,
905
+ "doc_path": file_path,
906
+ "chunk_index": i
907
+ })
908
+
909
+ if not embeddings:
910
+ logger.warning(f"No valid embeddings generated for {file_path}")
911
+ return False
912
+
913
+ embeddings = np.array(embeddings)
914
+
915
+ # Initialize or update FAISS index
916
+ if self.vector_store is None:
917
+ self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
918
+ self.vector_store.add(embeddings)
919
+ else:
920
+ self.vector_store.add(embeddings)
921
+
922
+ # Store metadata
923
+ start_idx = len(self.chunks)
924
+ self.chunks.extend(valid_chunks)
925
+
926
+ self.doc_metadata.append({
927
+ "title": title,
928
+ "path": file_path,
929
+ "chunk_count": len(valid_chunks),
930
+ "start_idx": start_idx,
931
+ "end_idx": start_idx + len(valid_chunks) - 1,
932
+ "total_pages": max([page["page_number"] for page in pages]) if pages else 1
933
+ })
934
+
935
+ # Save state after each document addition
936
+ self.save_state()
937
+
938
+ logger.info(f"Successfully added document '{title}' with {len(valid_chunks)} chunks")
939
+ return True
940
+
941
+ except Exception as e:
942
+ logger.error(f"Failed to process document {file_path}: {e}")
943
+ return False
944
+
945
+ def search_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
946
+ """Search for relevant chunks using semantic similarity"""
947
+ if self.vector_store is None or not self.chunks:
948
+ logger.warning("No documents have been indexed yet")
949
+ return []
950
+
951
+ query_embedding = self.generate_embedding(query)
952
+ if query_embedding is None:
953
+ logger.error("Failed to generate embedding for the query")
954
+ return []
955
+
956
+ query_embedding = np.array([query_embedding]) # Convert to 2D array
957
+
958
+ # Search FAISS index
959
+ distances, indices = self.vector_store.search(query_embedding, top_k)
960
+
961
+ # Convert to similarity scores (FAISS returns squared L2 distances)
962
+ similarities = 1 / (1 + distances[0])
963
+
964
+ results = []
965
+ for idx, sim in zip(indices[0], similarities):
966
+ if idx < 0 or idx >= len(self.chunks): # Invalid index
967
+ continue
968
+
969
+ chunk_data = self.chunks[idx]
970
+ results.append({
971
+ "text": chunk_data["text"],
972
+ "similarity": float(sim),
973
+ "doc_title": chunk_data["doc_title"],
974
+ "doc_path": chunk_data["doc_path"],
975
+ "chunk_index": chunk_data["chunk_index"],
976
+ "page_number": chunk_data["page_number"]
977
+ })
978
+
979
+ # Sort by similarity (highest first)
980
+ results.sort(key=lambda x: x["similarity"], reverse=True)
981
+
982
+ # Apply threshold
983
+ results = [r for r in results if r["similarity"] >= QUERY_SIMILARITY_THRESHOLD]
984
+
985
+ if not results and top_k > 0:
986
+ logger.info("No chunks met similarity threshold, returning top result anyway")
987
+ return results[:1]
988
+
989
+ return results
990
+
991
class RAGSystem:
    """High-level RAG facade.

    Owns a DocumentProcessor (chunking, embeddings, FAISS retrieval) and a
    Phi-2 text-generation pipeline, wiring them together for document-grounded
    question answering.
    """

    def __init__(self):
        # Builds the document processor, restores any persisted index, then
        # loads the generative model. A model-loading failure is non-fatal:
        # retrieval keeps working and Q&A is disabled (phi_pipe = None).
        logger.info("Initializing RAG System...")
        try:
            self.doc_processor = DocumentProcessor(embedding_model_name=EMBEDDING_MODEL_NAME, device=DEVICE)

            # Try to load existing state
            if self.doc_processor.load_state():
                logger.info("Successfully loaded existing document index")
            else:
                logger.info("Starting with a fresh document index")

            logger.info(f"Loading Generative LLM: {GENERATIVE_MODEL_NAME} on {DEVICE}...")
            try:
                phi_tokenizer = AutoTokenizer.from_pretrained(GENERATIVE_MODEL_NAME, trust_remote_code=True)
                model_kwargs = {"trust_remote_code": True}
                # Choose the narrowest dtype the hardware supports to save GPU memory.
                if DEVICE == 'cuda':
                    if torch.cuda.is_bf16_supported():
                        logger.info("Using bfloat16 for Phi-2 model.")
                        model_kwargs["torch_dtype"] = torch.bfloat16
                    else:
                        logger.info("Using float16 for Phi-2 model.")
                        model_kwargs["torch_dtype"] = torch.float16
                else:
                    logger.info("Using float32 for Phi-2 model on CPU.")
                    model_kwargs["torch_dtype"] = torch.float32

                phi_model = AutoModelForCausalLM.from_pretrained(GENERATIVE_MODEL_NAME, **model_kwargs)
                phi_model = phi_model.to(DEVICE)

                # transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
                pipeline_device_index = 0 if DEVICE == "cuda" else -1
                self.phi_pipe = pipeline(
                    "text-generation",
                    model=phi_model,
                    tokenizer=phi_tokenizer,
                    device=pipeline_device_index
                )
                logger.info(f"✅ Generative LLM ({GENERATIVE_MODEL_NAME}) loaded successfully on {DEVICE}.")
            except Exception as e:
                # Degrade gracefully: retrieval still works, Q&A is disabled.
                logger.error(f"❌ Critical Error loading Phi-2 model: {e}")
                logger.error("RAG Q&A functionality will be disabled.")
                self.phi_pipe = None

            logger.info("✅ RAG System initialized successfully.")

        except Exception as e:
            logger.critical(f"Failed to initialize RAG System: {e}", exc_info=True)
            raise RuntimeError("System initialization failed.") from e

    def add_document(self, file_path: str) -> bool:
        """Add a document to the system (delegates to DocumentProcessor)."""
        return self.doc_processor.add_document(file_path)

    def ask_question(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """Answer a question using RAG.

        Retrieves up to top_k relevant chunks, builds a grounded prompt, and
        generates an answer with Phi-2.

        Returns:
            Dict with keys "answer", "sources", "question_chunks" and (on
            success) "relevant_chunks". Errors are reported in "answer"
            rather than raised.
        """
        # Guard: the generative pipeline may be None if model loading failed.
        if self.phi_pipe is None:
            return {
                "answer": "Error: The AI model is not available. Please check the logs.",
                "sources": [],
                "question_chunks": []
            }

        logger.info(f"Processing question: '{question[:100]}...'")

        # Step 1: Retrieve relevant chunks
        relevant_chunks = self.doc_processor.search_chunks(question, top_k)
        if not relevant_chunks:
            return {
                "answer": "No relevant information found in documents to answer this question.",
                "sources": [],
                "question_chunks": []
            }

        # Step 2: Prepare context for generation — each chunk is labeled with
        # its document, page, and similarity so the model can cite sources.
        context = "\n\n---\n\n".join([
            f"Document: {chunk['doc_title']} (Page {chunk['page_number']})\nChunk {chunk['chunk_index']} (Similarity: {chunk['similarity']:.2f})\n\n{chunk['text']}"
            for chunk in relevant_chunks
        ])

        # Step 3: Generate answer with Phi-2 — the prompt constrains the model
        # to the retrieved context to limit hallucination.
        prompt = f"""You are a helpful assistant. Answer the question ONLY from the provided context. If the context is insufficient, just say you don't know.

Context:
{context}

Question: {question}

Answer: """

        try:
            output = self.phi_pipe(
                prompt,
                max_new_tokens=PHI_MAX_NEW_TOKENS,
                temperature=PHI_TEMPERATURE,
                do_sample=True,
                return_full_text=False,
                pad_token_id=self.phi_pipe.tokenizer.eos_token_id
            )

            generated_text = output[0]["generated_text"].strip()

            # Post-processing: the model sometimes continues with a new
            # "Question:" — truncate anything after that marker.
            if "Question:" in generated_text:
                generated_text = generated_text.split("Question:")[0].strip()

            # Extract sources, deduplicated by (document, page) pair.
            sources = []
            seen_docs = set()
            for chunk in relevant_chunks:
                doc_key = f"{chunk['doc_title']}_page_{chunk['page_number']}"
                if doc_key not in seen_docs:
                    sources.append({
                        "document": chunk['doc_title'],
                        "page_number": chunk['page_number'],
                        "path": chunk['doc_path'],
                        "similarity": chunk['similarity']
                    })
                    seen_docs.add(doc_key)

            # Prepare question chunks for display (text truncated to 200 chars).
            question_chunks = []
            for chunk in relevant_chunks:
                question_chunks.append({
                    "document": chunk['doc_title'],
                    "page_number": chunk['page_number'],
                    "chunk_index": chunk['chunk_index'],
                    "similarity": chunk['similarity'],
                    "text_preview": chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text']
                })

            return {
                "answer": generated_text,
                "sources": sources,
                "question_chunks": question_chunks,
                "relevant_chunks": relevant_chunks  # For debugging/explanation
            }

        except Exception as e:
            logger.error(f"Error generating answer: {e}")
            return {
                "answer": f"Error generating answer: {str(e)}",
                "sources": [],
                "question_chunks": []
            }

    def explain_retrieval(self, question: str):
        """Explain the retrieval process for educational purposes.

        Prints (to stdout) each RAG stage — embedding, retrieval, context
        preparation, generation — then the final answer and its sources.
        """
        print("\n=== RAG Process Explanation ===")
        print(f"Question: {question}")

        # Step 1: Show query embedding
        print("\n1. Query Embedding:")
        query_embedding = self.doc_processor.generate_embedding(question)
        if query_embedding is not None:
            print(f"- Generated {len(query_embedding)}-dimensional embedding vector")
            print(f"- Sample values: {query_embedding[:5]}...")
        else:
            print("Failed to generate query embedding")
            return

        # Step 2: Show retrieval
        print("\n2. Document Chunk Retrieval:")
        chunks = self.doc_processor.search_chunks(question, top_k=3)
        if not chunks:
            print("No relevant chunks found")
            return

        print(f"Found {len(chunks)} relevant chunks:")
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(f"- Source: {chunk['doc_title']} (Page {chunk['page_number']})")
            print(f"- Chunk Index: {chunk['chunk_index']}")
            print(f"- Similarity Score: {chunk['similarity']:.4f}")
            print(f"- Text Preview: {chunk['text'][:150]}...")

        # Step 3: Show context preparation
        print("\n3. Context Preparation:")
        print("The top chunks are combined into a context that will be sent to the LLM")

        # Step 4: Show generation
        print("\n4. Generation with Phi-2:")
        print("The LLM is prompted to answer the question using ONLY the provided context")
        print("This helps prevent hallucination by grounding the response in the retrieved documents")

        # Show actual answer (note: this re-runs retrieval internally).
        result = self.ask_question(question)
        print("\nFinal Answer:")
        print(result['answer'])

        print("\nSources with Page Numbers:")
        for source in result['sources']:
            print(f"- {source['document']} (Page {source['page_number']}, similarity: {source['similarity']:.2f})")

    def list_documents(self) -> List[Dict[str, Any]]:
        """List all indexed documents (title, path, chunk and page counts)."""
        return [{
            "title": doc["title"],
            "path": doc["path"],
            "chunk_count": doc["chunk_count"],
            # Older persisted metadata may lack "total_pages"; default to 1.
            "total_pages": doc.get("total_pages", 1)
        } for doc in self.doc_processor.doc_metadata]

    def clear_index(self) -> bool:
        """Clear all indexed documents. Returns True on success."""
        return self.doc_processor.clear_state()

    def close(self):
        """Persist state and release model/GPU resources."""
        logger.info("Shutting down RAG System...")
        # Save state before closing
        self.doc_processor.save_state()

        if hasattr(self, 'phi_pipe') and self.phi_pipe:
            del self.phi_pipe
        if hasattr(self.doc_processor, 'embedding_model'):
            del self.doc_processor.embedding_model
        if DEVICE == 'cuda':
            torch.cuda.empty_cache()
            logger.info("Cleared CUDA cache.")
        logger.info("RAG System shut down.")
1211
+
1212
def main():
    """Run the interactive command-line menu for the RAG system."""
    rag_system = RAGSystem()

    menu = ("\n1. Add Document\n"
            "2. Ask Question\n"
            "3. Explain Retrieval Process\n"
            "4. List Indexed Documents\n"
            "5. Clear All Documents\n"
            "6. Exit")

    def handle_add():
        # Strip surrounding quotes so drag-and-dropped paths work.
        file_path = input("Enter document path (CSV, DOCX, PDF, etc.): ").strip('"')
        if not os.path.exists(file_path):
            print("File not found!")
            return
        if rag_system.add_document(file_path):
            print("Document added successfully!")
        else:
            print("Failed to add document")

    def handle_ask():
        question = input("Enter your question: ")
        result = rag_system.ask_question(question)
        print("\nAnswer:", result["answer"])
        if result["sources"]:
            print("\nSources:")
            for src in result["sources"]:
                print(f"- {src['document']} (Page {src['page_number']}, similarity: {src['similarity']:.2f})")
        else:
            print("(No sources cited)")

        if result["question_chunks"]:
            print("\nRelevant Chunks:")
            for i, chunk in enumerate(result["question_chunks"], 1):
                print(f"{i}. {chunk['document']} (Page {chunk['page_number']}, Chunk {chunk['chunk_index']})")
                print(f"   Similarity: {chunk['similarity']:.2f}")
                print(f"   Preview: {chunk['text_preview']}")
                print()

    def handle_explain():
        question = input("Enter a question to explain the retrieval process: ")
        rag_system.explain_retrieval(question)

    def handle_list():
        docs = rag_system.list_documents()
        if not docs:
            print("No documents indexed yet")
            return
        print("\nIndexed Documents:")
        for i, doc in enumerate(docs, 1):
            print(f"{i}. {doc['title']} ({doc['chunk_count']} chunks, {doc['total_pages']} pages)")
            print(f"   Path: {doc['path']}")

    def handle_clear():
        confirm = input("Are you sure you want to clear ALL documents? (y/n): ")
        if confirm.lower() == 'y':
            if rag_system.clear_index():
                print("All documents cleared")
            else:
                print("Failed to clear documents")

    # Menu choice -> handler; "6" (exit) is handled inline below.
    actions = {
        "1": handle_add,
        "2": handle_ask,
        "3": handle_explain,
        "4": handle_list,
        "5": handle_clear,
    }

    while True:
        print(menu)
        choice = input("Enter your choice: ")

        if choice == "6":
            rag_system.close()
            break

        handler = actions.get(choice)
        if handler is None:
            print("Invalid choice")
        else:
            handler()


if __name__ == "__main__":
    main()
README.md CHANGED
@@ -1,11 +1,48 @@
1
- ---
2
- title: RAG QA
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Document Q&A with RAG
3
+ emoji: 📄
4
+ colorFrom: indigo
5
+ colorTo: blue # ✅ valid color now
6
+ sdk: gradio
7
+ sdk_version: "4.28.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+
13
+ # Document Q&A with RAG System
14
+
15
+ This is a Retrieval-Augmented Generation (RAG) system deployed on Hugging Face Spaces. It allows you to:
16
+
17
+ 1. Upload documents (PDF, DOCX, TXT, CSV)
18
+ 2. Ask questions about the content
19
+ 3. Get answers grounded in your documents
20
+
21
+ ## Features
22
+
23
+ - Supports multiple document formats
24
+ - Semantic search for relevant content
25
+ - Generative answers using Phi-2 model
26
+ - Persistent document storage
27
+ - Web interface and API endpoints
28
+
29
+ ## How to Use
30
+
31
+ 1. Upload documents using the upload form
32
+ 2. Ask questions in natural language
33
+ 3. View answers with cited sources
34
+
35
+ ## Technical Details
36
+
37
+ - Embedding model: `all-MiniLM-L12-v2`
38
+ - Generative model: `microsoft/phi-2`
39
+ - Vector store: FAISS
40
+ - Web framework: FastAPI + Gradio
41
+
42
+ ## Deployment
43
+
44
+ This app is automatically deployed on Hugging Face Spaces. To run locally:
45
+
46
+ ```bash
47
+ pip install -r requirements.txt
48
+ python app.py
+ ```
__pycache__/RAG.cpython-312.pyc ADDED
Binary file (33.6 kB). View file
 
__pycache__/app.cpython-312.pyc ADDED
Binary file (22.9 kB). View file
 
__pycache__/rag_system.cpython-312.pyc ADDED
Binary file (25.1 kB). View file
 
app.py ADDED
@@ -0,0 +1,1379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Request
2
+ # from fastapi.responses import HTMLResponse, JSONResponse
3
+ # from fastapi.staticfiles import StaticFiles
4
+ # from fastapi.templating import Jinja2Templates
5
+ # from pydantic import BaseModel
6
+ # import os
7
+ # import tempfile
8
+ # import shutil
9
+ # from typing import List, Dict, Any
10
+ # import logging
11
+
12
+ # import sys
13
+ # import os
14
+ # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
15
+
16
+ # try:
17
+ # from RAG import RAGSystem
18
+ # except ImportError:
19
+ # print("Error: Cannot import RAGSystem from RAG.py")
20
+ # print("Make sure RAG.py is in the same directory as app.py")
21
+ # sys.exit(1)
22
+
23
+ # logging.basicConfig(level=logging.DEBUG)
24
+ # logger = logging.getLogger(__name__)
25
+
26
+ # app = FastAPI(title="RAG PDF QA System")
27
+
28
+ # # Setup templates directory
29
+ # templates = Jinja2Templates(directory="templates")
30
+
31
+ # # Try to mount static files directory
32
+ # try:
33
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
34
+ # except Exception as e:
35
+ # logger.warning(f"Static files directory not found: {e}")
36
+
37
+ # # Initialize RAG System
38
+ # try:
39
+ # rag_system = RAGSystem()
40
+ # logger.info("RAG System initialized successfully")
41
+ # except Exception as e:
42
+ # logger.error(f"Failed to initialize RAG System: {e}")
43
+ # rag_system = None
44
+
45
+ # class QuestionRequest(BaseModel):
46
+ # question: str
47
+
48
+ # @app.get("/", response_class=HTMLResponse)
49
+ # async def read_root(request: Request):
50
+ # try:
51
+ # return templates.TemplateResponse("index.html", {"request": request})
52
+ # except Exception as e:
53
+ # logger.error(f"Error serving index.html from templates folder: {e}")
54
+ # return HTMLResponse(content=f"""
55
+ # <html>
56
+ # <body>
57
+ # <h1>RAG PDF QA System</h1>
58
+ # <p>Error: Could not load index.html from templates folder</p>
59
+ # <p>Error details: {str(e)}</p>
60
+ # <p>Make sure you have:</p>
61
+ # <ul>
62
+ # <li>A 'templates' folder in the same directory as app.py</li>
63
+ # <li>index.html file inside the templates folder</li>
64
+ # <li>Installed jinja2: pip install jinja2</li>
65
+ # </ul>
66
+ # </body>
67
+ # </html>
68
+ # """)
69
+
70
+ # @app.post("/upload")
71
+ # async def upload_document(file: UploadFile = File(...)):
72
+ # try:
73
+ # if rag_system is None:
74
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
75
+
76
+ # if not file.filename:
77
+ # raise HTTPException(status_code=400, detail="No file selected")
78
+
79
+ # allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
80
+ # file_extension = os.path.splitext(file.filename)[1].lower()
81
+
82
+ # if file_extension not in allowed_extensions:
83
+ # raise HTTPException(status_code=400, detail=f"File type {file_extension} not supported. Supported types: {', '.join(allowed_extensions)}")
84
+
85
+ # # Create temporary file
86
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
87
+ # shutil.copyfileobj(file.file, temp_file)
88
+ # temp_path = temp_file.name
89
+
90
+ # # Process document
91
+ # success = rag_system.add_document(temp_path)
92
+
93
+ # # Clean up temporary file
94
+ # try:
95
+ # os.unlink(temp_path)
96
+ # except Exception as cleanup_error:
97
+ # logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
98
+
99
+ # if success:
100
+ # return JSONResponse(content={"message": f"Document '{file.filename}' uploaded and processed successfully"})
101
+ # else:
102
+ # raise HTTPException(status_code=500, detail="Failed to process document")
103
+
104
+ # except HTTPException:
105
+ # raise
106
+ # except Exception as e:
107
+ # logger.error(f"Upload error: {e}", exc_info=True)
108
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
109
+
110
+ # @app.post("/ask")
111
+ # async def ask_question(request: QuestionRequest):
112
+ # try:
113
+ # if rag_system is None:
114
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
115
+
116
+ # if not request.question.strip():
117
+ # raise HTTPException(status_code=400, detail="Question cannot be empty")
118
+
119
+ # result = rag_system.ask_question(request.question)
120
+
121
+ # return JSONResponse(content={
122
+ # "answer": result["answer"],
123
+ # "sources": result["sources"]
124
+ # })
125
+
126
+ # except HTTPException:
127
+ # raise
128
+ # except Exception as e:
129
+ # logger.error(f"Question error: {e}", exc_info=True)
130
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
131
+
132
+ # @app.get("/documents")
133
+ # async def get_documents():
134
+ # try:
135
+ # if rag_system is None:
136
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
137
+
138
+ # docs = rag_system.list_documents()
139
+ # return JSONResponse(content={"documents": docs})
140
+ # except HTTPException:
141
+ # raise
142
+ # except Exception as e:
143
+ # logger.error(f"Documents list error: {e}", exc_info=True)
144
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
145
+
146
+ # @app.delete("/clear")
147
+ # async def clear_documents():
148
+ # try:
149
+ # if rag_system is None:
150
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
151
+
152
+ # success = rag_system.clear_index()
153
+ # if success:
154
+ # return JSONResponse(content={"message": "All documents cleared successfully"})
155
+ # else:
156
+ # raise HTTPException(status_code=500, detail="Failed to clear documents")
157
+ # except HTTPException:
158
+ # raise
159
+ # except Exception as e:
160
+ # logger.error(f"Clear error: {e}", exc_info=True)
161
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
162
+
163
+ # @app.get("/health")
164
+ # async def health_check():
165
+ # return {
166
+ # "status": "healthy",
167
+ # "rag_system_initialized": rag_system is not None,
168
+ # "message": "RAG PDF QA System is running"
169
+ # }
170
+
171
+ # if __name__ == "__main__":
172
+ # import uvicorn
173
+ # logger.info("Starting FastAPI server...")
174
+ # uvicorn.run(app, host="0.0.0.0", port=8000, log_level="debug")
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+ # # second code
188
+ # from fastapi import FastAPI, File, UploadFile, HTTPException, Request
189
+ # from fastapi.responses import HTMLResponse, JSONResponse
190
+ # from fastapi.staticfiles import StaticFiles
191
+ # from fastapi.templating import Jinja2Templates
192
+ # from pydantic import BaseModel
193
+ # import os
194
+ # import tempfile
195
+ # import shutil
196
+ # from typing import List, Dict, Any
197
+ # import logging
198
+
199
+ # import sys
200
+ # import os
201
+ # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
202
+
203
+ # try:
204
+ # from RAG import RAGSystem
205
+ # except ImportError:
206
+ # print("Error: Cannot import RAGSystem from RAG.py")
207
+ # print("Make sure RAG.py is in the same directory as app.py")
208
+ # sys.exit(1)
209
+
210
+ # logging.basicConfig(level=logging.INFO)
211
+ # logger = logging.getLogger(__name__)
212
+
213
+ # app = FastAPI(title="RAG PDF QA System")
214
+
215
+ # # Setup templates directory
216
+ # templates = Jinja2Templates(directory="templates")
217
+
218
+ # # Try to mount static files directory
219
+ # try:
220
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
221
+ # except Exception as e:
222
+ # logger.warning(f"Static files directory not found: {e}")
223
+
224
+ # # Initialize RAG System
225
+ # try:
226
+ # rag_system = RAGSystem()
227
+ # logger.info("RAG System initialized successfully")
228
+ # except Exception as e:
229
+ # logger.error(f"Failed to initialize RAG System: {e}")
230
+ # rag_system = None
231
+
232
+ # class QuestionRequest(BaseModel):
233
+ # question: str
234
+ # top_k: int = 3
235
+
236
+ # @app.get("/", response_class=HTMLResponse)
237
+ # async def read_root(request: Request):
238
+ # try:
239
+ # return templates.TemplateResponse("index.html", {"request": request})
240
+ # except Exception as e:
241
+ # logger.error(f"Error serving index.html from templates folder: {e}")
242
+ # return HTMLResponse(content=f"""
243
+ # <html>
244
+ # <body>
245
+ # <h1>RAG PDF QA System</h1>
246
+ # <p>Error: Could not load index.html from templates folder</p>
247
+ # <p>Error details: {str(e)}</p>
248
+ # <p>Make sure you have:</p>
249
+ # <ul>
250
+ # <li>A 'templates' folder in the same directory as app.py</li>
251
+ # <li>index.html file inside the templates folder</li>
252
+ # <li>Installed jinja2: pip install jinja2</li>
253
+ # </ul>
254
+ # </body>
255
+ # </html>
256
+ # """)
257
+
258
+ # @app.post("/upload")
259
+ # async def upload_document(file: UploadFile = File(...)):
260
+ # try:
261
+ # if rag_system is None:
262
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
263
+
264
+ # if not file.filename:
265
+ # raise HTTPException(status_code=400, detail="No file selected")
266
+
267
+ # allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
268
+ # file_extension = os.path.splitext(file.filename)[1].lower()
269
+
270
+ # if file_extension not in allowed_extensions:
271
+ # raise HTTPException(status_code=400, detail=f"File type {file_extension} not supported. Supported types: {', '.join(allowed_extensions)}")
272
+
273
+ # # Create temporary file
274
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
275
+ # shutil.copyfileobj(file.file, temp_file)
276
+ # temp_path = temp_file.name
277
+
278
+ # # Process document
279
+ # success = rag_system.add_document(temp_path)
280
+
281
+ # # Clean up temporary file
282
+ # try:
283
+ # os.unlink(temp_path)
284
+ # except Exception as cleanup_error:
285
+ # logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
286
+
287
+ # if success:
288
+ # return JSONResponse(content={"message": f"Document '{file.filename}' uploaded and processed successfully"})
289
+ # else:
290
+ # raise HTTPException(status_code=500, detail="Failed to process document")
291
+
292
+ # except HTTPException:
293
+ # raise
294
+ # except Exception as e:
295
+ # logger.error(f"Upload error: {e}", exc_info=True)
296
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
297
+
298
+ # @app.post("/ask")
299
+ # async def ask_question(request: QuestionRequest):
300
+ # try:
301
+ # if rag_system is None:
302
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
303
+
304
+ # if not request.question.strip():
305
+ # raise HTTPException(status_code=400, detail="Question cannot be empty")
306
+
307
+ # result = rag_system.ask_question(request.question, top_k=request.top_k)
308
+
309
+ # return JSONResponse(content={
310
+ # "answer": result["answer"],
311
+ # "sources": result["sources"],
312
+ # "question_chunks": result.get("question_chunks", []),
313
+ # "relevant_chunks": result.get("relevant_chunks", [])
314
+ # })
315
+
316
+ # except HTTPException:
317
+ # raise
318
+ # except Exception as e:
319
+ # logger.error(f"Question error: {e}", exc_info=True)
320
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
321
+
322
+ # @app.get("/documents")
323
+ # async def get_documents():
324
+ # try:
325
+ # if rag_system is None:
326
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
327
+
328
+ # docs = rag_system.list_documents()
329
+ # return JSONResponse(content={"documents": docs})
330
+ # except HTTPException:
331
+ # raise
332
+ # except Exception as e:
333
+ # logger.error(f"Documents list error: {e}", exc_info=True)
334
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
335
+
336
+ # @app.delete("/clear")
337
+ # async def clear_documents():
338
+ # try:
339
+ # if rag_system is None:
340
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
341
+
342
+ # success = rag_system.clear_index()
343
+ # if success:
344
+ # return JSONResponse(content={"message": "All documents cleared successfully"})
345
+ # else:
346
+ # raise HTTPException(status_code=500, detail="Failed to clear documents")
347
+ # except HTTPException:
348
+ # raise
349
+ # except Exception as e:
350
+ # logger.error(f"Clear error: {e}", exc_info=True)
351
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
352
+
353
+ # @app.post("/search")
354
+ # async def search_chunks(request: QuestionRequest):
355
+ # try:
356
+ # if rag_system is None:
357
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
358
+
359
+ # if not request.question.strip():
360
+ # raise HTTPException(status_code=400, detail="Search query cannot be empty")
361
+
362
+ # chunks = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)
363
+
364
+ # return JSONResponse(content={
365
+ # "query": request.question,
366
+ # "chunks": chunks,
367
+ # "total_found": len(chunks)
368
+ # })
369
+
370
+ # except HTTPException:
371
+ # raise
372
+ # except Exception as e:
373
+ # logger.error(f"Search error: {e}", exc_info=True)
374
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
375
+
376
+ # @app.get("/health")
377
+ # async def health_check():
378
+ # return {
379
+ # "status": "healthy",
380
+ # "rag_system_initialized": rag_system is not None,
381
+ # "message": "RAG PDF QA System is running",
382
+ # "indexed_documents": len(rag_system.list_documents()) if rag_system else 0
383
+ # }
384
+
385
+ # @app.get("/stats")
386
+ # async def get_stats():
387
+ # try:
388
+ # if rag_system is None:
389
+ # raise HTTPException(status_code=500, detail="RAG System not initialized")
390
+
391
+ # docs = rag_system.list_documents()
392
+
393
+ # total_chunks = sum(doc.get("chunk_count", 0) for doc in docs)
394
+ # total_pages = sum(doc.get("total_pages", 1) for doc in docs)
395
+
396
+ # return JSONResponse(content={
397
+ # "total_documents": len(docs),
398
+ # "total_chunks": total_chunks,
399
+ # "total_pages": total_pages,
400
+ # "documents": docs
401
+ # })
402
+
403
+ # except HTTPException:
404
+ # raise
405
+ # except Exception as e:
406
+ # logger.error(f"Stats error: {e}", exc_info=True)
407
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
408
+
409
+ # @app.on_event("shutdown")
410
+ # async def shutdown_event():
411
+ # if rag_system:
412
+ # rag_system.close()
413
+ # logger.info("RAG System closed gracefully")
414
+
415
+ # if __name__ == "__main__":
416
+ # import uvicorn
417
+ # logger.info("Starting FastAPI server...")
418
+ # uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
419
+
420
+
421
+
422
+          # complete code
423
+ # from fastapi import FastAPI, File, UploadFile, HTTPException, Request
424
+ # from fastapi.responses import HTMLResponse, JSONResponse
425
+ # from fastapi.staticfiles import StaticFiles
426
+ # from fastapi.templating import Jinja2Templates
427
+ # from pydantic import BaseModel
428
+ # import os
429
+ # import tempfile
430
+ # import shutil
431
+ # from typing import List, Dict, Any, Optional, Union
432
+ # import logging
433
+ # from datetime import datetime
434
+ # import mimetypes
435
+
436
+ # # Import your RAG system
437
+ # import sys
438
+ # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
439
+ # try:
440
+ # from RAG import RAGSystem
441
+ # except ImportError:
442
+ # print("Error: Cannot import RAGSystem from RAG.py")
443
+ # print("Make sure RAG.py is in the same directory as app.py")
444
+ # sys.exit(1)
445
+
446
+ # # Configure logging
447
+ # logging.basicConfig(
448
+ # level=logging.INFO,
449
+ # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
450
+ # )
451
+ # logger = logging.getLogger(__name__)
452
+
453
+ # # Initialize FastAPI app
454
+ # app = FastAPI(
455
+ # title="Scholar's Archive - Document Intelligence System",
456
+ # description="A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology",
457
+ # version="1.0.0",
458
+ # docs_url="/api/docs",
459
+ # redoc_url="/api/redoc"
460
+ # )
461
+
462
+ # # Setup templates directory
463
+ # templates = Jinja2Templates(directory="templates")
464
+
465
+ # # Try to mount static files directory
466
+ # try:
467
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
468
+ # except Exception as e:
469
+ # logger.warning(f"Static files directory not found: {e}")
470
+
471
+ # # Initialize RAG System
472
+ # try:
473
+ # rag_system = RAGSystem()
474
+ # logger.info("Scholar's Archive RAG System initialized successfully")
475
+ # except Exception as e:
476
+ # logger.error(f"Failed to initialize RAG System: {e}")
477
+ # rag_system = None
478
+
479
+ # # Pydantic models
480
+ # class QuestionRequest(BaseModel):
481
+ # question: str
482
+ # top_k: int = 3
483
+
484
+ # class DocumentInfo(BaseModel):
485
+ # title: str
486
+ # file_type: str
487
+ # upload_date: str
488
+ # chunk_count: int
489
+ # total_pages: Optional[int] = None
490
+
491
+ # # Fixed AnswerResponse model to handle both strings and dictionaries
492
+ # class AnswerResponse(BaseModel):
493
+ # answer: str
494
+ # sources: List[Dict[str, Any]]
495
+ # question_chunks: List[Union[str, Dict[str, Any]]] = []
496
+ # relevant_chunks: List[Union[str, Dict[str, Any]]] = []
497
+
498
+ # class StatsResponse(BaseModel):
499
+ # total_documents: int
500
+ # total_chunks: int
501
+ # total_pages: int
502
+ # documents: List[DocumentInfo]
503
+
504
+ # # Utility functions
505
+ # def get_file_type_icon(filename: str) -> str:
506
+ # """Get appropriate icon for file type"""
507
+ # ext = os.path.splitext(filename)[1].lower()
508
+ # icons = {
509
+ # '.pdf': 'fas fa-file-pdf',
510
+ # '.docx': 'fas fa-file-word',
511
+ # '.txt': 'fas fa-file-alt',
512
+ # '.csv': 'fas fa-file-csv'
513
+ # }
514
+ # return icons.get(ext, 'fas fa-file')
515
+
516
+ # def format_file_size(size_bytes: int) -> str:
517
+ # """Format file size in human readable format"""
518
+ # if size_bytes == 0:
519
+ # return "0 B"
520
+ # size_names = ["B", "KB", "MB", "GB"]
521
+ # i = 0
522
+ # while size_bytes >= 1024 and i < len(size_names) - 1:
523
+ # size_bytes /= 1024.0
524
+ # i += 1
525
+ # return f"{size_bytes:.1f} {size_names[i]}"
526
+
527
+ # def extract_content_from_chunks(chunks):
528
+ # """Extract string content from chunk data structures"""
529
+ # if not chunks:
530
+ # return []
531
+
532
+ # extracted = []
533
+ # for chunk in chunks:
534
+ # if isinstance(chunk, str):
535
+ # extracted.append(chunk)
536
+ # elif isinstance(chunk, dict):
537
+ # # Try different possible keys for text content
538
+ # content = chunk.get('text') or chunk.get('content') or chunk.get('document') or str(chunk)
539
+ # extracted.append(content)
540
+ # else:
541
+ # extracted.append(str(chunk))
542
+
543
+ # return extracted
544
+
545
+ # # Routes
546
+ # @app.get("/", response_class=HTMLResponse)
547
+ # async def read_root(request: Request):
548
+ # """Serve the main classical interface"""
549
+ # try:
550
+ # return templates.TemplateResponse("index.html", {"request": request})
551
+ # except Exception as e:
552
+ # logger.error(f"Error serving index.html from templates folder: {e}")
553
+ # # Return the embedded HTML if templates folder is not available
554
+ # with open("scholar_archive.html", "r", encoding="utf-8") as f:
555
+ # html_content = f.read()
556
+ # return HTMLResponse(content=html_content)
557
+
558
+ # @app.post("/upload")
559
+ # async def upload_document(file: UploadFile = File(...)):
560
+ # """Upload and process a document"""
561
+ # try:
562
+ # if rag_system is None:
563
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
564
+
565
+ # if not file.filename:
566
+ # raise HTTPException(status_code=400, detail="No file selected")
567
+
568
+ # # Validate file type
569
+ # allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
570
+ # file_extension = os.path.splitext(file.filename)[1].lower()
571
+
572
+ # if file_extension not in allowed_extensions:
573
+ # raise HTTPException(
574
+ # status_code=400,
575
+ # detail=f"File type {file_extension} not supported. Supported formats: {', '.join(allowed_extensions)}"
576
+ # )
577
+
578
+ # # Check file size (limit to 50MB)
579
+ # file_size = 0
580
+ # content = await file.read()
581
+ # file_size = len(content)
582
+
583
+ # if file_size > 50 * 1024 * 1024: # 50MB limit
584
+ # raise HTTPException(status_code=400, detail="File size too large. Maximum size is 50MB")
585
+
586
+ # # Create temporary file
587
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
588
+ # temp_file.write(content)
589
+ # temp_path = temp_file.name
590
+
591
+ # logger.info(f"Processing document: {file.filename} ({format_file_size(file_size)})")
592
+
593
+ # # Process document
594
+ # success = rag_system.add_document(temp_path)
595
+
596
+ # # Clean up temporary file
597
+ # try:
598
+ # os.unlink(temp_path)
599
+ # except Exception as cleanup_error:
600
+ # logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
601
+
602
+ # if success:
603
+ # logger.info(f"Successfully processed document: {file.filename}")
604
+ # return JSONResponse(content={
605
+ # "message": f"Document '{file.filename}' has been successfully added to the Scholar's Archive",
606
+ # "filename": file.filename,
607
+ # "size": format_file_size(file_size),
608
+ # "type": file_extension
609
+ # })
610
+ # else:
611
+ # raise HTTPException(status_code=500, detail="Failed to process document")
612
+
613
+ # except HTTPException:
614
+ # raise
615
+ # except Exception as e:
616
+ # logger.error(f"Upload error: {e}", exc_info=True)
617
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
618
+
619
+ # @app.post("/ask", response_model=AnswerResponse)
620
+ # async def ask_question(request: QuestionRequest):
621
+ # """Ask a question about the uploaded documents"""
622
+ # try:
623
+ # if rag_system is None:
624
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
625
+
626
+ # if not request.question.strip():
627
+ # raise HTTPException(status_code=400, detail="Question cannot be empty")
628
+
629
+ # logger.info(f"Processing question: {request.question[:100]}...")
630
+
631
+ # # Get answer from RAG system
632
+ # result = rag_system.ask_question(request.question, top_k=request.top_k)
633
+
634
+ # # Handle the chunks data properly
635
+ # question_chunks = result.get("question_chunks", [])
636
+ # relevant_chunks = result.get("relevant_chunks", [])
637
+
638
+ # # Log the structure to understand what we're getting
639
+ # logger.info(f"Question chunks type: {type(question_chunks)}")
640
+ # logger.info(f"Relevant chunks type: {type(relevant_chunks)}")
641
+ # if question_chunks:
642
+ # logger.info(f"First question chunk type: {type(question_chunks[0])}")
643
+ # if relevant_chunks:
644
+ # logger.info(f"First relevant chunk type: {type(relevant_chunks[0])}")
645
+
646
+ # # Format the response - keep original structure but ensure it's serializable
647
+ # response = AnswerResponse(
648
+ # answer=result["answer"],
649
+ # sources=result["sources"],
650
+ # question_chunks=question_chunks,
651
+ # relevant_chunks=relevant_chunks
652
+ # )
653
+
654
+ # logger.info(f"Successfully answered question with {len(result['sources'])} sources")
655
+
656
+ # return response
657
+
658
+ # except HTTPException:
659
+ # raise
660
+ # except Exception as e:
661
+ # logger.error(f"Question processing error: {e}", exc_info=True)
662
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
663
+
664
+ # @app.get("/documents")
665
+ # async def get_documents():
666
+ # """Get list of all uploaded documents"""
667
+ # try:
668
+ # if rag_system is None:
669
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
670
+
671
+ # docs = rag_system.list_documents()
672
+
673
+ # # Format documents with additional metadata
674
+ # formatted_docs = []
675
+ # for doc in docs:
676
+ # formatted_doc = {
677
+ # "title": doc.get("title", "Unknown Document"),
678
+ # "chunk_count": doc.get("chunk_count", 0),
679
+ # "total_pages": doc.get("total_pages"),
680
+ # "file_type": os.path.splitext(doc.get("title", ""))[1].lower(),
681
+ # "upload_date": doc.get("upload_date", datetime.now().isoformat()),
682
+ # "icon": get_file_type_icon(doc.get("title", ""))
683
+ # }
684
+ # formatted_docs.append(formatted_doc)
685
+
686
+ # return JSONResponse(content={"documents": formatted_docs})
687
+
688
+ # except HTTPException:
689
+ # raise
690
+ # except Exception as e:
691
+ # logger.error(f"Documents list error: {e}", exc_info=True)
692
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
693
+
694
+ # @app.get("/stats", response_model=StatsResponse)
695
+ # async def get_stats():
696
+ # """Get statistics about the document collection"""
697
+ # try:
698
+ # if rag_system is None:
699
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
700
+
701
+ # docs = rag_system.list_documents()
702
+
703
+ # total_chunks = sum(doc.get("chunk_count", 0) for doc in docs)
704
+ # total_pages = sum(doc.get("total_pages", 1) for doc in docs if doc.get("total_pages"))
705
+
706
+ # # Format documents
707
+ # formatted_docs = []
708
+ # for doc in docs:
709
+ # formatted_doc = DocumentInfo(
710
+ # title=doc.get("title", "Unknown Document"),
711
+ # file_type=os.path.splitext(doc.get("title", ""))[1].lower(),
712
+ # upload_date=doc.get("upload_date", datetime.now().isoformat()),
713
+ # chunk_count=doc.get("chunk_count", 0),
714
+ # total_pages=doc.get("total_pages")
715
+ # )
716
+ # formatted_docs.append(formatted_doc)
717
+
718
+ # stats = StatsResponse(
719
+ # total_documents=len(docs),
720
+ # total_chunks=total_chunks,
721
+ # total_pages=total_pages,
722
+ # documents=formatted_docs
723
+ # )
724
+
725
+ # return stats
726
+
727
+ # except HTTPException:
728
+ # raise
729
+ # except Exception as e:
730
+ # logger.error(f"Stats error: {e}", exc_info=True)
731
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
732
+
733
+ # @app.delete("/clear")
734
+ # async def clear_documents():
735
+ # """Clear all documents from the archive"""
736
+ # try:
737
+ # if rag_system is None:
738
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
739
+
740
+ # logger.info("Clearing all documents from Scholar's Archive")
741
+
742
+ # success = rag_system.clear_index()
743
+ # if success:
744
+ # logger.info("Successfully cleared all documents")
745
+ # return JSONResponse(content={
746
+ # "message": "All documents have been successfully removed from the Scholar's Archive"
747
+ # })
748
+ # else:
749
+ # raise HTTPException(status_code=500, detail="Failed to clear documents")
750
+
751
+ # except HTTPException:
752
+ # raise
753
+ # except Exception as e:
754
+ # logger.error(f"Clear error: {e}", exc_info=True)
755
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
756
+
757
+ # @app.post("/search")
758
+ # async def search_chunks(request: QuestionRequest):
759
+ # """Search for relevant document chunks"""
760
+ # try:
761
+ # if rag_system is None:
762
+ # raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
763
+
764
+ # if not request.question.strip():
765
+ # raise HTTPException(status_code=400, detail="Search query cannot be empty")
766
+
767
+ # logger.info(f"Searching chunks for: {request.question[:100]}...")
768
+
769
+ # chunks = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)
770
+
771
+ # # Format chunks with additional metadata
772
+ # formatted_chunks = []
773
+ # for chunk in chunks:
774
+ # formatted_chunk = {
775
+ # "content": chunk.get("content", ""),
776
+ # "document": chunk.get("document", "Unknown"),
777
+ # "similarity": chunk.get("similarity", 0.0),
778
+ # "page": chunk.get("page"),
779
+ # "chunk_index": chunk.get("chunk_index")
780
+ # }
781
+ # formatted_chunks.append(formatted_chunk)
782
+
783
+ # return JSONResponse(content={
784
+ # "query": request.question,
785
+ # "chunks": formatted_chunks,
786
+ # "total_found": len(formatted_chunks)
787
+ # })
788
+
789
+ # except HTTPException:
790
+ # raise
791
+ # except Exception as e:
792
+ # logger.error(f"Search error: {e}", exc_info=True)
793
+ # raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
794
+
795
+ # @app.get("/health")
796
+ # async def health_check():
797
+ # """Health check endpoint"""
798
+ # try:
799
+ # doc_count = len(rag_system.list_documents()) if rag_system else 0
800
+
801
+ # return {
802
+ # "status": "healthy",
803
+ # "service": "Scholar's Archive - Document Intelligence System",
804
+ # "version": "1.0.0",
805
+ # "rag_system_initialized": rag_system is not None,
806
+ # "indexed_documents": doc_count,
807
+ # "timestamp": datetime.now().isoformat(),
808
+ # "message": "Scholar's Archive is operational and ready to serve"
809
+ # }
810
+ # except Exception as e:
811
+ # logger.error(f"Health check error: {e}")
812
+ # return {
813
+ # "status": "degraded",
814
+ # "service": "Scholar's Archive - Document Intelligence System",
815
+ # "error": str(e),
816
+ # "timestamp": datetime.now().isoformat()
817
+ # }
818
+
819
+ # @app.get("/api/info")
820
+ # async def api_info():
821
+ # """Get API information"""
822
+ # return {
823
+ # "name": "Scholar's Archive API",
824
+ # "description": "Document Intelligence System API",
825
+ # "version": "1.0.0",
826
+ # "endpoints": {
827
+ # "upload": "POST /upload - Upload documents",
828
+ # "ask": "POST /ask - Ask questions",
829
+ # "documents": "GET /documents - List documents",
830
+ # "stats": "GET /stats - Get statistics",
831
+ # "search": "POST /search - Search chunks",
832
+ # "clear": "DELETE /clear - Clear all documents",
833
+ # "health": "GET /health - Health check"
834
+ # },
835
+ # "supported_formats": [".pdf", ".docx", ".txt", ".csv"],
836
+ # "max_file_size": "50MB"
837
+ # }
838
+
839
+ # # Event handlers
840
+ # @app.on_event("startup")
841
+ # async def startup_event():
842
+ # """Application startup event"""
843
+ # logger.info("Starting Scholar's Archive - Document Intelligence System")
844
+ # logger.info("System initialized and ready to serve scholarly inquiries")
845
+
846
+ # @app.on_event("shutdown")
847
+ # async def shutdown_event():
848
+ # """Application shutdown event"""
849
+ # if rag_system:
850
+ # rag_system.close()
851
+ # logger.info("Scholar's Archive system closed gracefully")
852
+ # logger.info("Scholar's Archive shutdown complete")
853
+
854
+ # # Error handlers
855
+ # @app.exception_handler(404)
856
+ # async def not_found_handler(request: Request, exc):
857
+ # """Custom 404 handler"""
858
+ # return JSONResponse(
859
+ # status_code=404,
860
+ # content={
861
+ # "detail": "The requested resource was not found in the Scholar's Archive",
862
+ # "path": str(request.url.path)
863
+ # }
864
+ # )
865
+
866
+ # @app.exception_handler(500)
867
+ # async def internal_error_handler(request: Request, exc):
868
+ # """Custom 500 handler"""
869
+ # logger.error(f"Internal server error: {exc}")
870
+ # return JSONResponse(
871
+ # status_code=500,
872
+ # content={
873
+ # "detail": "An internal error occurred in the Scholar's Archive system",
874
+ # "message": "Please try again later or contact support"
875
+ # }
876
+ # )
877
+
878
+ # # Main execution
879
+ # if __name__ == "__main__":
880
+ # import uvicorn
881
+
882
+ # logger.info("Launching Scholar's Archive - Document Intelligence System")
883
+ # logger.info("Access the interface at: http://localhost:8000")
884
+ # logger.info("API documentation at: http://localhost:8000/api/docs")
885
+
886
+ # uvicorn.run(
887
+ # app,
888
+ # host="0.0.0.0",
889
+ # port=7860,
890
+ # log_level="info",
891
+ # reload=False,
892
+ # access_log=True
893
+ # )
894
+
895
+
896
+
897
+
898
+
899
+ # perfect code
900
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Request
901
+ from fastapi.responses import HTMLResponse, JSONResponse
902
+ from fastapi.staticfiles import StaticFiles
903
+ from fastapi.templating import Jinja2Templates
904
+ from pydantic import BaseModel
905
+ import os
906
+ import tempfile
907
+ import shutil
908
+ from typing import List, Dict, Any, Optional, Union
909
+ import logging
910
+ from datetime import datetime
911
+ import mimetypes
912
+
913
+ # Import your RAG system
914
+ import sys
915
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
916
+ try:
917
+ from RAG import RAGSystem
918
+ except ImportError:
919
+ print("Error: Cannot import RAGSystem from RAG.py")
920
+ print("Make sure RAG.py is in the same directory as app.py")
921
+ sys.exit(1)
922
+
923
# Configure logging (timestamped records for all app and RAG messages)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app; interactive API docs are served under /api/docs and /api/redoc
app = FastAPI(
    title="Scholar's Archive - Document Intelligence System",
    description="A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology",
    version="1.0.0",
    docs_url="/api/docs",
    redoc_url="/api/redoc"
)

# Setup templates directory (read_root renders index.html from here)
templates = Jinja2Templates(directory="templates")

# Try to mount static files directory; the static folder is optional, so a
# missing directory only logs a warning instead of aborting startup.
try:
    app.mount("/static", StaticFiles(directory="static"), name="static")
except Exception as e:
    logger.warning(f"Static files directory not found: {e}")

# Initialize RAG System. On failure, rag_system stays None and every endpoint
# checks for that and responds with HTTP 500.
try:
    rag_system = RAGSystem()
    logger.info("Scholar's Archive RAG System initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize RAG System: {e}")
    rag_system = None
955
+
956
+ # Pydantic models
957
class QuestionRequest(BaseModel):
    # Request payload for the /ask and /search endpoints.
    question: str  # natural-language question or search query
    top_k: int = 3  # number of chunks to retrieve from the index
960
+
961
class DocumentInfo(BaseModel):
    # Metadata describing one indexed document (returned inside /stats).
    title: str  # original filename of the document
    file_type: str  # lowercase file extension, e.g. '.pdf'
    upload_date: str  # ISO-8601 timestamp of when the document was added
    chunk_count: int  # number of indexed text chunks for this document
    total_pages: Optional[int] = None  # page count when known, else None
967
+
968
class AnswerResponse(BaseModel):
    # Response payload for /ask. Chunk lists accept both plain strings and
    # dicts because the RAG layer may return either shape.
    answer: str  # generated answer text
    sources: List[Dict[str, Any]]  # source metadata for each cited chunk
    question_chunks: List[Union[str, Dict[str, Any]]] = []  # chunks matched to the question
    relevant_chunks: List[Union[str, Dict[str, Any]]] = []  # chunks used as answer context
973
+
974
class StatsResponse(BaseModel):
    # Aggregate collection statistics returned by /stats.
    total_documents: int  # number of indexed documents
    total_chunks: int  # sum of chunk_count across all documents
    total_pages: int  # sum of known page counts across documents
    documents: List[DocumentInfo]  # per-document details
979
+
980
+ # Utility functions
981
def get_file_type_icon(filename: str) -> str:
    """Return the Font Awesome icon class matching a file's extension."""
    icon_by_extension = {
        '.pdf': 'fas fa-file-pdf',
        '.docx': 'fas fa-file-word',
        '.txt': 'fas fa-file-alt',
        '.csv': 'fas fa-file-csv',
    }
    # splitext keeps the leading dot; unknown/missing extensions fall back
    # to the generic file icon.
    _, extension = os.path.splitext(filename)
    return icon_by_extension.get(extension.lower(), 'fas fa-file')
991
+
992
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string, e.g. '1.5 KB'."""
    if size_bytes == 0:
        return "0 B"
    units = ("B", "KB", "MB", "GB")
    value = float(size_bytes)
    unit_index = 0
    # Scale down by 1024 until the value fits the unit, capping at GB.
    while value >= 1024 and unit_index < len(units) - 1:
        value /= 1024.0
        unit_index += 1
    return f"{value:.1f} {units[unit_index]}"
1002
+
1003
def extract_content_from_chunks(chunks):
    """Normalize a list of chunk records into plain strings.

    Each chunk may be a plain string, a dict carrying its text under one of
    the keys 'text', 'content' or 'document' (checked in that order), or any
    other object (stringified as a fallback).

    Args:
        chunks: Iterable of chunk records, or a falsy value.

    Returns:
        list[str]: One string per input chunk; [] when `chunks` is empty/None.
    """
    if not chunks:
        return []

    extracted = []
    for chunk in chunks:
        if isinstance(chunk, str):
            extracted.append(chunk)
        elif isinstance(chunk, dict):
            # Check keys for presence (not truthiness): the previous
            # `get('text') or get('content') or ...` chain treated an
            # empty-string value as missing and leaked the dict's repr.
            for key in ('text', 'content', 'document'):
                if chunk.get(key) is not None:
                    extracted.append(str(chunk[key]))
                    break
            else:
                extracted.append(str(chunk))
        else:
            extracted.append(str(chunk))

    return extracted
1020
+
1021
+
1022
+
1023
+ # Routes
1024
+ @app.get("/", response_class=HTMLResponse)
1025
+ async def read_root(request: Request):
1026
+ """Serve the main classical interface"""
1027
+ try:
1028
+ return templates.TemplateResponse("index.html", {"request": request})
1029
+ except Exception as e:
1030
+ logger.error(f"Error serving index.html from templates folder: {e}")
1031
+ # Return the embedded HTML if templates folder is not available
1032
+ with open("scholar_archive.html", "r", encoding="utf-8") as f:
1033
+ html_content = f.read()
1034
+ return HTMLResponse(content=html_content)
1035
+
1036
+ @app.post("/upload")
1037
+ async def upload_document(file: UploadFile = File(...)):
1038
+ """Upload and process a document"""
1039
+ try:
1040
+ if rag_system is None:
1041
+ raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
1042
+
1043
+ if not file.filename:
1044
+ raise HTTPException(status_code=400, detail="No file selected")
1045
+
1046
+ # Validate file type
1047
+ allowed_extensions = ['.pdf', '.docx', '.txt', '.csv']
1048
+ file_extension = os.path.splitext(file.filename)[1].lower()
1049
+
1050
+ if file_extension not in allowed_extensions:
1051
+ raise HTTPException(
1052
+ status_code=400,
1053
+ detail=f"File type {file_extension} not supported. Supported formats: {', '.join(allowed_extensions)}"
1054
+ )
1055
+
1056
+ # Check file size (limit to 50MB)
1057
+ file_size = 0
1058
+ content = await file.read()
1059
+ file_size = len(content)
1060
+
1061
+ if file_size > 50 * 1024 * 1024: # 50MB limit
1062
+ raise HTTPException(status_code=400, detail="File size too large. Maximum size is 50MB")
1063
+
1064
+ # Create temporary file
1065
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
1066
+ temp_file.write(content)
1067
+ temp_path = temp_file.name
1068
+
1069
+ logger.info(f"Processing document: {file.filename} ({format_file_size(file_size)})")
1070
+
1071
+ # Process document
1072
+ success = rag_system.add_document(temp_path)
1073
+
1074
+ # Clean up temporary file
1075
+ try:
1076
+ os.unlink(temp_path)
1077
+ except Exception as cleanup_error:
1078
+ logger.warning(f"Failed to cleanup temp file: {cleanup_error}")
1079
+
1080
+ if success:
1081
+ logger.info(f"Successfully processed document: {file.filename}")
1082
+ return JSONResponse(content={
1083
+ "message": f"Document '{file.filename}' has been successfully added to the Scholar's Archive",
1084
+ "filename": file.filename,
1085
+ "size": format_file_size(file_size),
1086
+ "type": file_extension
1087
+ })
1088
+ else:
1089
+ raise HTTPException(status_code=500, detail="Failed to process document")
1090
+
1091
+ except HTTPException:
1092
+ raise
1093
+ except Exception as e:
1094
+ logger.error(f"Upload error: {e}", exc_info=True)
1095
+ raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1096
+
1097
+ @app.post("/ask", response_model=AnswerResponse)
1098
+ async def ask_question(request: QuestionRequest):
1099
+ """Ask a question about the uploaded documents"""
1100
+ try:
1101
+ if rag_system is None:
1102
+ raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")
1103
+
1104
+ if not request.question.strip():
1105
+ raise HTTPException(status_code=400, detail="Question cannot be empty")
1106
+
1107
+ logger.info(f"Processing question: {request.question[:100]}...")
1108
+
1109
+ # Get answer from RAG system
1110
+ result = rag_system.ask_question(request.question, top_k=request.top_k)
1111
+
1112
+ # Handle the chunks data properly
1113
+ question_chunks = result.get("question_chunks", [])
1114
+ relevant_chunks = result.get("relevant_chunks", [])
1115
+
1116
+ # Add page numbers to sources
1117
+ sources = result.get("sources", [])
1118
+ for source in sources:
1119
+ if isinstance(source, dict):
1120
+ page_num = source.get('page')
1121
+ if page_num:
1122
+ source['page_reference'] = f"Page {page_num}"
1123
+
1124
+ # Log the structure to understand what we're getting
1125
+ logger.info(f"Question chunks type: {type(question_chunks)}")
1126
+ logger.info(f"Relevant chunks type: {type(relevant_chunks)}")
1127
+ if question_chunks:
1128
+ logger.info(f"First question chunk type: {type(question_chunks[0])}")
1129
+ if relevant_chunks:
1130
+ logger.info(f"First relevant chunk type: {type(relevant_chunks[0])}")
1131
+
1132
+ # Format the response - keep original structure but ensure it's serializable
1133
+ response = AnswerResponse(
1134
+ answer=result["answer"],
1135
+ sources=sources,
1136
+ question_chunks=question_chunks,
1137
+ relevant_chunks=relevant_chunks
1138
+ )
1139
+
1140
+ logger.info(f"Successfully answered question with {len(sources)} sources")
1141
+
1142
+ return response
1143
+
1144
+ except HTTPException:
1145
+ raise
1146
+ except Exception as e:
1147
+ logger.error(f"Question processing error: {e}", exc_info=True)
1148
+ raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1149
+
1150
@app.get("/documents")
async def get_documents():
    """Return metadata for every document currently indexed in the archive.

    Response body: {"documents": [...]} where each entry carries the title,
    chunk count, page count, file extension, upload date, and a UI icon.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        def describe(entry):
            # Build the client-facing metadata record for one indexed document.
            # NOTE(review): `entry` is assumed to be a dict produced by
            # rag_system.list_documents() — confirm its schema there.
            raw_title = entry.get("title", "")
            return {
                "title": entry.get("title", "Unknown Document"),
                "chunk_count": entry.get("chunk_count", 0),
                "total_pages": entry.get("total_pages"),
                "file_type": os.path.splitext(raw_title)[1].lower(),
                "upload_date": entry.get("upload_date", datetime.now().isoformat()),
                "icon": get_file_type_icon(raw_title),
            }

        documents = [describe(entry) for entry in rag_system.list_documents()]
        return JSONResponse(content={"documents": documents})

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Documents list error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1179
+
1180
@app.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Aggregate collection-wide statistics: document, chunk, and page totals,
    plus a per-document summary list."""
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        docs = rag_system.list_documents()

        # Collection totals. Page counts may be absent for some formats, so
        # only truthy total_pages values contribute to the page sum.
        chunk_total = sum(d.get("chunk_count", 0) for d in docs)
        page_total = sum(p for p in (d.get("total_pages") for d in docs) if p)

        document_infos = [
            DocumentInfo(
                title=d.get("title", "Unknown Document"),
                file_type=os.path.splitext(d.get("title", ""))[1].lower(),
                upload_date=d.get("upload_date", datetime.now().isoformat()),
                chunk_count=d.get("chunk_count", 0),
                total_pages=d.get("total_pages"),
            )
            for d in docs
        ]

        return StatsResponse(
            total_documents=len(docs),
            total_chunks=chunk_total,
            total_pages=page_total,
            documents=document_infos,
        )

    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as e:
        logger.error(f"Stats error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1218
+
1219
@app.delete("/clear")
async def clear_documents():
    """Remove every document from the archive's index.

    Returns a confirmation message on success; raises 500 when the index
    could not be cleared.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        logger.info("Clearing all documents from Scholar's Archive")

        # clear_index() reports success via its boolean return value.
        if not rag_system.clear_index():
            raise HTTPException(status_code=500, detail="Failed to clear documents")

        logger.info("Successfully cleared all documents")
        return JSONResponse(content={
            "message": "All documents have been successfully removed from the Scholar's Archive"
        })

    except HTTPException:
        # Deliberate HTTP errors propagate as-is.
        raise
    except Exception as e:
        logger.error(f"Clear error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1242
+
1243
@app.post("/search")
async def search_chunks(request: QuestionRequest):
    """Run a similarity search over the index and return the matching chunks.

    Reuses QuestionRequest: `question` is the query text, `top_k` bounds the
    number of results. Raises 400 on an empty query.
    """
    try:
        if rag_system is None:
            raise HTTPException(status_code=500, detail="Scholar's Archive system not initialized")

        if not request.question.strip():
            raise HTTPException(status_code=400, detail="Search query cannot be empty")

        logger.info(f"Searching chunks for: {request.question[:100]}...")

        hits = rag_system.doc_processor.search_chunks(request.question, top_k=request.top_k)

        # Project each raw hit onto exactly the fields the client expects.
        results = [
            {
                "content": hit.get("content", ""),
                "document": hit.get("document", "Unknown"),
                "similarity": hit.get("similarity", 0.0),
                "page": hit.get("page"),
                "chunk_index": hit.get("chunk_index"),
            }
            for hit in hits
        ]

        return JSONResponse(content={
            "query": request.question,
            "chunks": results,
            "total_found": len(results)
        })

    except HTTPException:
        # Deliberate HTTP errors propagate as-is.
        raise
    except Exception as e:
        logger.error(f"Search error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
1280
+
1281
@app.get("/health")
async def health_check():
    """Health probe: reports 'healthy' normally, 'degraded' if inspection fails.

    Never raises — a failure while gathering status is folded into the
    degraded payload so monitoring always gets a 200 response.
    """
    try:
        # Count indexed documents, tolerating an uninitialized RAG system.
        indexed = len(rag_system.list_documents()) if rag_system else 0

        return {
            "status": "healthy",
            "service": "Scholar's Archive - Document Intelligence System",
            "version": "1.0.0",
            "rag_system_initialized": rag_system is not None,
            "indexed_documents": indexed,
            "timestamp": datetime.now().isoformat(),
            "message": "Scholar's Archive is operational and ready to serve"
        }
    except Exception as e:
        logger.error(f"Health check error: {e}")
        return {
            "status": "degraded",
            "service": "Scholar's Archive - Document Intelligence System",
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }
1304
+
1305
@app.get("/api/info")
async def api_info():
    """Describe the API surface: endpoints, accepted formats, and size limit."""
    # Human-readable summary of every route the service exposes.
    endpoint_summary = {
        "upload": "POST /upload - Upload documents",
        "ask": "POST /ask - Ask questions",
        "documents": "GET /documents - List documents",
        "stats": "GET /stats - Get statistics",
        "search": "POST /search - Search chunks",
        "clear": "DELETE /clear - Clear all documents",
        "health": "GET /health - Health check"
    }

    return {
        "name": "Scholar's Archive API",
        "description": "Document Intelligence System API",
        "version": "1.0.0",
        "endpoints": endpoint_summary,
        "supported_formats": [".pdf", ".docx", ".txt", ".csv"],
        "max_file_size": "50MB"
    }
1324
+
1325
+ # Event handlers
1326
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers — works today, worth migrating eventually.
@app.on_event("startup")
async def startup_event():
    """Application startup event: logs that the service has come up."""
    logger.info("Starting Scholar's Archive - Document Intelligence System")
    logger.info("System initialized and ready to serve scholarly inquiries")
1331
+
1332
@app.on_event("shutdown")
async def shutdown_event():
    """Application shutdown event: releases RAG system resources, if any."""
    # rag_system may be None when initialization failed at import time.
    if rag_system:
        rag_system.close()
        logger.info("Scholar's Archive system closed gracefully")
    logger.info("Scholar's Archive shutdown complete")
1339
+
1340
+ # Error handlers
1341
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    """Custom 404 handler: themed JSON payload including the missed path."""
    payload = {
        "detail": "The requested resource was not found in the Scholar's Archive",
        "path": str(request.url.path)
    }
    return JSONResponse(status_code=404, content=payload)
1351
+
1352
@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
    """Custom 500 handler: logs the failure, hides internals from the client."""
    logger.error(f"Internal server error: {exc}")
    payload = {
        "detail": "An internal error occurred in the Scholar's Archive system",
        "message": "Please try again later or contact support"
    }
    return JSONResponse(status_code=500, content=payload)
1363
+
1364
+ # Main execution
1365
if __name__ == "__main__":
    import uvicorn

    # BUG FIX: the server binds to port 7860 below, but the startup logs
    # previously advertised http://localhost:8000 — the URLs now match.
    logger.info("Launching Scholar's Archive - Document Intelligence System")
    logger.info("Access the interface at: http://localhost:7860")
    logger.info("API documentation at: http://localhost:7860/api/docs")

    uvicorn.run(
        app,
        host="0.0.0.0",  # listen on all interfaces (container-friendly)
        port=7860,       # must agree with the URLs logged above and EXPOSE in the Dockerfile
        log_level="info",
        reload=False,    # reload=True requires an import string, not an app object
        access_log=True
    )
rag_storage/metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea47f16e743fe3b0236ad60c5bf23262ac2a654bf3471d0ef3da50af2813bd3f
3
+ size 53947
rag_storage/vector_store.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51fa7faf223455f98a8e566eca56bcbb9604a37dbb2a41c80bf32bfd60454d8d
3
+ size 365613
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ gradio==4.44.1
4
+ sentence-transformers
5
+ nltk
6
+ pymupdf
7
+ numpy
8
+ faiss-cpu
9
+ torch
10
+ transformers
11
+ unstructured
12
+ python-multipart
13
+ Jinja2
templates/index.html ADDED
@@ -0,0 +1,1338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>RAG PDF Question Answering System</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Times New Roman', serif;
16
+ line-height: 1.6;
17
+ color: #333;
18
+ background-color: #f8f9fa;
19
+ padding: 20px;
20
+ }
21
+
22
+ .container {
23
+ max-width: 1000px;
24
+ margin: 0 auto;
25
+ background: white;
26
+ border-radius: 8px;
27
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
28
+ overflow: hidden;
29
+ }
30
+
31
+ .header {
32
+ background: linear-gradient(135deg, #2c3e50, #34495e);
33
+ color: white;
34
+ padding: 30px;
35
+ text-align: center;
36
+ }
37
+
38
+ .header h1 {
39
+ font-size: 2.2em;
40
+ margin-bottom: 10px;
41
+ font-weight: normal;
42
+ }
43
+
44
+ .header p {
45
+ font-size: 1.1em;
46
+ opacity: 0.9;
47
+ }
48
+
49
+ .main-content {
50
+ padding: 30px;
51
+ }
52
+
53
+ .section {
54
+ margin-bottom: 30px;
55
+ padding: 25px;
56
+ border: 1px solid #e0e0e0;
57
+ border-radius: 6px;
58
+ background: #fafafa;
59
+ }
60
+
61
+ .section h2 {
62
+ color: #2c3e50;
63
+ margin-bottom: 15px;
64
+ font-size: 1.4em;
65
+ border-bottom: 2px solid #3498db;
66
+ padding-bottom: 8px;
67
+ }
68
+
69
+ .upload-area {
70
+ border: 2px dashed #bdc3c7;
71
+ border-radius: 6px;
72
+ padding: 20px;
73
+ text-align: center;
74
+ background: white;
75
+ margin-bottom: 15px;
76
+ transition: border-color 0.3s ease;
77
+ }
78
+
79
+ .upload-area:hover {
80
+ border-color: #3498db;
81
+ }
82
+
83
+ .file-input {
84
+ margin: 10px 0;
85
+ }
86
+
87
+ input[type="file"] {
88
+ padding: 8px;
89
+ border: 1px solid #ddd;
90
+ border-radius: 4px;
91
+ font-family: inherit;
92
+ }
93
+
94
+ .btn {
95
+ background: #3498db;
96
+ color: white;
97
+ border: none;
98
+ padding: 12px 24px;
99
+ border-radius: 4px;
100
+ cursor: pointer;
101
+ font-size: 1em;
102
+ font-family: inherit;
103
+ transition: background 0.3s ease;
104
+ }
105
+
106
+ .btn:hover {
107
+ background: #2980b9;
108
+ }
109
+
110
+ .btn:disabled {
111
+ background: #bdc3c7;
112
+ cursor: not-allowed;
113
+ }
114
+
115
+ .btn-danger {
116
+ background: #e74c3c;
117
+ }
118
+
119
+ .btn-danger:hover {
120
+ background: #c0392b;
121
+ }
122
+
123
+ .question-area {
124
+ background: white;
125
+ padding: 20px;
126
+ border-radius: 6px;
127
+ border: 1px solid #ddd;
128
+ }
129
+
130
+ textarea {
131
+ width: 100%;
132
+ padding: 12px;
133
+ border: 1px solid #ddd;
134
+ border-radius: 4px;
135
+ font-family: inherit;
136
+ font-size: 1em;
137
+ resize: vertical;
138
+ min-height: 80px;
139
+ }
140
+
141
+ .answer-section {
142
+ background: white;
143
+ padding: 20px;
144
+ border-radius: 6px;
145
+ border: 1px solid #ddd;
146
+ margin-top: 15px;
147
+ }
148
+
149
+ .answer {
150
+ background: #f8f9fa;
151
+ padding: 15px;
152
+ border-left: 4px solid #3498db;
153
+ margin-bottom: 15px;
154
+ border-radius: 0 4px 4px 0;
155
+ }
156
+
157
+ .sources {
158
+ background: #fff;
159
+ border: 1px solid #e0e0e0;
160
+ border-radius: 4px;
161
+ padding: 15px;
162
+ }
163
+
164
+ .sources h4 {
165
+ color: #2c3e50;
166
+ margin-bottom: 10px;
167
+ }
168
+
169
+ .source-item {
170
+ padding: 8px 0;
171
+ border-bottom: 1px solid #eee;
172
+ }
173
+
174
+ .source-item:last-child {
175
+ border-bottom: none;
176
+ }
177
+
178
+ .documents-list {
179
+ background: white;
180
+ border-radius: 6px;
181
+ border: 1px solid #ddd;
182
+ max-height: 200px;
183
+ overflow-y: auto;
184
+ }
185
+
186
+ .document-item {
187
+ padding: 12px 15px;
188
+ border-bottom: 1px solid #eee;
189
+ display: flex;
190
+ justify-content: space-between;
191
+ align-items: center;
192
+ }
193
+
194
+ .document-item:last-child {
195
+ border-bottom: none;
196
+ }
197
+
198
+ .document-name {
199
+ font-weight: bold;
200
+ color: #2c3e50;
201
+ }
202
+
203
+ .document-chunks {
204
+ color: #7f8c8d;
205
+ font-size: 0.9em;
206
+ }
207
+
208
+ .status-message {
209
+ padding: 12px;
210
+ border-radius: 4px;
211
+ margin: 10px 0;
212
+ font-weight: bold;
213
+ }
214
+
215
+ .status-success {
216
+ background: #d4edda;
217
+ color: #155724;
218
+ border: 1px solid #c3e6cb;
219
+ }
220
+
221
+ .status-error {
222
+ background: #f8d7da;
223
+ color: #721c24;
224
+ border: 1px solid #f5c6cb;
225
+ }
226
+
227
+ .loading {
228
+ display: inline-block;
229
+ width: 20px;
230
+ height: 20px;
231
+ border: 3px solid #f3f3f3;
232
+ border-top: 3px solid #3498db;
233
+ border-radius: 50%;
234
+ animation: spin 1s linear infinite;
235
+ margin-right: 10px;
236
+ }
237
+
238
+ @keyframes spin {
239
+ 0% { transform: rotate(0deg); }
240
+ 100% { transform: rotate(360deg); }
241
+ }
242
+
243
+ .hidden {
244
+ display: none;
245
+ }
246
+
247
+ .controls {
248
+ display: flex;
249
+ gap: 10px;
250
+ align-items: center;
251
+ flex-wrap: wrap;
252
+ }
253
+
254
+ .no-documents {
255
+ text-align: center;
256
+ color: #7f8c8d;
257
+ padding: 20px;
258
+ font-style: italic;
259
+ }
260
+ </style>
261
+ </head>
262
+ <body>
263
+ <div class="container">
264
+ <div class="header">
265
+ <h1>RAG PDF Question Answering System</h1>
266
+ <p>Upload documents and ask questions to get AI-powered answers</p>
267
+ </div>
268
+
269
+ <div class="main-content">
270
+ <div class="section">
271
+ <h2>📁 Upload Documents</h2>
272
+ <div class="upload-area">
273
+ <p>Select a document to upload (PDF, DOCX, TXT, CSV)</p>
274
+ <div class="file-input">
275
+ <input type="file" id="fileInput" accept=".pdf,.docx,.txt,.csv">
276
+ </div>
277
+ <div class="controls">
278
+ <button class="btn" onclick="uploadDocument()" id="uploadBtn">Upload Document</button>
279
+ <button class="btn btn-danger" onclick="clearAllDocuments()" id="clearBtn">Clear All Documents</button>
280
+ </div>
281
+ </div>
282
+ <div id="uploadStatus"></div>
283
+ </div>
284
+
285
+ <div class="section">
286
+ <h2>📚 Indexed Documents</h2>
287
+ <div id="documentsList" class="documents-list">
288
+ <div class="no-documents">No documents uploaded yet</div>
289
+ </div>
290
+ </div>
291
+
292
+ <div class="section">
293
+ <h2>❓ Ask Questions</h2>
294
+ <div class="question-area">
295
+ <textarea id="questionInput" placeholder="Enter your question about the uploaded documents..."></textarea>
296
+ <div style="margin-top: 15px;">
297
+ <button class="btn" onclick="askQuestion()" id="askBtn">Ask Question</button>
298
+ </div>
299
+ </div>
300
+ <div id="answerSection" class="answer-section hidden">
301
+ <div id="answerContent" class="answer"></div>
302
+ <div id="sourcesContent" class="sources"></div>
303
+ </div>
304
+ </div>
305
+ </div>
306
+ </div>
307
+
308
+ <script>
309
+ let isUploading = false;
310
+ let isAsking = false;
311
+
312
+ function showMessage(message, type) {
313
+ const statusDiv = document.getElementById('uploadStatus');
314
+ statusDiv.innerHTML = `<div class="status-message status-${type}">${message}</div>`;
315
+ setTimeout(() => {
316
+ statusDiv.innerHTML = '';
317
+ }, 5000);
318
+ }
319
+
320
+ function setLoadingState(isLoading, buttonId, loadingText, normalText) {
321
+ const button = document.getElementById(buttonId);
322
+ if (isLoading) {
323
+ button.innerHTML = `<span class="loading"></span>${loadingText}`;
324
+ button.disabled = true;
325
+ } else {
326
+ button.innerHTML = normalText;
327
+ button.disabled = false;
328
+ }
329
+ }
330
+
331
+ async function uploadDocument() {
332
+ const fileInput = document.getElementById('fileInput');
333
+ const file = fileInput.files[0];
334
+
335
+ if (!file) {
336
+ showMessage('Please select a file first', 'error');
337
+ return;
338
+ }
339
+
340
+ isUploading = true;
341
+ setLoadingState(true, 'uploadBtn', 'Uploading...', 'Upload Document');
342
+
343
+ const formData = new FormData();
344
+ formData.append('file', file);
345
+
346
+ try {
347
+ const response = await fetch('/upload', {
348
+ method: 'POST',
349
+ body: formData
350
+ });
351
+
352
+ const result = await response.json();
353
+
354
+ if (response.ok) {
355
+ showMessage(result.message, 'success');
356
+ fileInput.value = '';
357
+ loadDocuments();
358
+ } else {
359
+ showMessage(result.detail || 'Upload failed', 'error');
360
+ }
361
+ } catch (error) {
362
+ showMessage('Network error: ' + error.message, 'error');
363
+ } finally {
364
+ isUploading = false;
365
+ setLoadingState(false, 'uploadBtn', 'Uploading...', 'Upload Document');
366
+ }
367
+ }
368
+
369
+ async function askQuestion() {
370
+ const questionInput = document.getElementById('questionInput');
371
+ const question = questionInput.value.trim();
372
+
373
+ if (!question) {
374
+ showMessage('Please enter a question', 'error');
375
+ return;
376
+ }
377
+
378
+ isAsking = true;
379
+ setLoadingState(true, 'askBtn', 'Processing...', 'Ask Question');
380
+
381
+ try {
382
+ const response = await fetch('/ask', {
383
+ method: 'POST',
384
+ headers: {
385
+ 'Content-Type': 'application/json',
386
+ },
387
+ body: JSON.stringify({ question: question })
388
+ });
389
+
390
+ const result = await response.json();
391
+
392
+ if (response.ok) {
393
+ displayAnswer(result.answer, result.sources);
394
+ } else {
395
+ showMessage(result.detail || 'Failed to get answer', 'error');
396
+ }
397
+ } catch (error) {
398
+ showMessage('Network error: ' + error.message, 'error');
399
+ } finally {
400
+ isAsking = false;
401
+ setLoadingState(false, 'askBtn', 'Processing...', 'Ask Question');
402
+ }
403
+ }
404
+
405
+ function displayAnswer(answer, sources) {
406
+ const answerSection = document.getElementById('answerSection');
407
+ const answerContent = document.getElementById('answerContent');
408
+ const sourcesContent = document.getElementById('sourcesContent');
409
+
410
+ answerContent.innerHTML = `<strong>Answer:</strong><br>${answer}`;
411
+
412
+ if (sources && sources.length > 0) {
413
+ let sourcesHtml = '<h4>Sources:</h4>';
414
+ sources.forEach((source, index) => {
415
+ sourcesHtml += `
416
+ <div class="source-item">
417
+ <strong>${index + 1}. ${source.document}</strong>
418
+ <br><small>Similarity: ${(source.similarity * 100).toFixed(1)}%</small>
419
+ </div>
420
+ `;
421
+ });
422
+ sourcesContent.innerHTML = sourcesHtml;
423
+ } else {
424
+ sourcesContent.innerHTML = '<h4>Sources:</h4><p>No sources found</p>';
425
+ }
426
+
427
+ answerSection.classList.remove('hidden');
428
+ }
429
+
430
+ async function loadDocuments() {
431
+ try {
432
+ const response = await fetch('/documents');
433
+ const result = await response.json();
434
+
435
+ const documentsList = document.getElementById('documentsList');
436
+
437
+ if (result.documents && result.documents.length > 0) {
438
+ let html = '';
439
+ result.documents.forEach(doc => {
440
+ html += `
441
+ <div class="document-item">
442
+ <div>
443
+ <div class="document-name">${doc.title}</div>
444
+ <div class="document-chunks">${doc.chunk_count} chunks</div>
445
+ </div>
446
+ </div>
447
+ `;
448
+ });
449
+ documentsList.innerHTML = html;
450
+ } else {
451
+ documentsList.innerHTML = '<div class="no-documents">No documents uploaded yet</div>';
452
+ }
453
+ } catch (error) {
454
+ console.error('Error loading documents:', error);
455
+ }
456
+ }
457
+
458
+ async function clearAllDocuments() {
459
+ if (!confirm('Are you sure you want to clear all documents? This action cannot be undone.')) {
460
+ return;
461
+ }
462
+
463
+ setLoadingState(true, 'clearBtn', 'Clearing...', 'Clear All Documents');
464
+
465
+ try {
466
+ const response = await fetch('/clear', {
467
+ method: 'DELETE'
468
+ });
469
+
470
+ const result = await response.json();
471
+
472
+ if (response.ok) {
473
+ showMessage(result.message, 'success');
474
+ loadDocuments();
475
+ document.getElementById('answerSection').classList.add('hidden');
476
+ } else {
477
+ showMessage(result.detail || 'Failed to clear documents', 'error');
478
+ }
479
+ } catch (error) {
480
+ showMessage('Network error: ' + error.message, 'error');
481
+ } finally {
482
+ setLoadingState(false, 'clearBtn', 'Clearing...', 'Clear All Documents');
483
+ }
484
+ }
485
+
486
+ document.getElementById('questionInput').addEventListener('keypress', function(e) {
487
+ if (e.key === 'Enter' && e.ctrlKey) {
488
+ askQuestion();
489
+ }
490
+ });
491
+
492
+ window.onload = function() {
493
+ loadDocuments();
494
+ };
495
+ </script>
496
+ </body>
497
+ </html> -->
498
+
499
+
500
+ <!-- perfect index.html -->
501
+ <!DOCTYPE html>
502
+ <html lang="en">
503
+ <head>
504
+ <meta charset="UTF-8">
505
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
506
+ <title>Scholar's Archive - Document Intelligence System</title>
507
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" rel="stylesheet">
508
+ <style>
509
+ :root {
510
+ --primary-color: #1a365d;
511
+ --secondary-color: #2d5a87;
512
+ --accent-color: #c7955b;
513
+ --light-bg: #f8f6f0;
514
+ --cream: #faf8f2;
515
+ --text-dark: #2c3e50;
516
+ --text-muted: #718096;
517
+ --border-color: #e2d8cc;
518
+ --shadow: 0 4px 20px rgba(26, 54, 93, 0.1);
519
+ --shadow-hover: 0 8px 30px rgba(26, 54, 93, 0.15);
520
+ }
521
+
522
+ * {
523
+ margin: 0;
524
+ padding: 0;
525
+ box-sizing: border-box;
526
+ }
527
+
528
+ body {
529
+ font-family: 'Georgia', 'Times New Roman', serif;
530
+ line-height: 1.7;
531
+ background: linear-gradient(135deg, var(--light-bg) 0%, var(--cream) 100%);
532
+ color: var(--text-dark);
533
+ min-height: 100vh;
534
+ }
535
+
536
+ .container {
537
+ max-width: 1200px;
538
+ margin: 0 auto;
539
+ padding: 20px;
540
+ }
541
+
542
+ .header {
543
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
544
+ color: white;
545
+ text-align: center;
546
+ padding: 3rem 2rem;
547
+ border-radius: 15px 15px 0 0;
548
+ box-shadow: var(--shadow);
549
+ position: relative;
550
+ overflow: hidden;
551
+ }
552
+
553
+ .header::before {
554
+ content: '';
555
+ position: absolute;
556
+ top: 0;
557
+ left: 0;
558
+ right: 0;
559
+ bottom: 0;
560
+ background: url('data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><defs><pattern id="grain" patternUnits="userSpaceOnUse" width="100" height="100"><circle cx="20" cy="20" r="1" fill="rgba(255,255,255,0.05)"/><circle cx="80" cy="40" r="1" fill="rgba(255,255,255,0.03)"/><circle cx="40" cy="80" r="1" fill="rgba(255,255,255,0.04)"/></pattern></defs><rect width="100" height="100" fill="url(%23grain)"/></svg>');
561
+ }
562
+
563
+ .header-content {
564
+ position: relative;
565
+ z-index: 1;
566
+ }
567
+
568
+ .header h1 {
569
+ font-size: 2.8rem;
570
+ margin-bottom: 0.5rem;
571
+ font-weight: 300;
572
+ letter-spacing: 1px;
573
+ }
574
+
575
+ .header .subtitle {
576
+ font-size: 1.2rem;
577
+ opacity: 0.9;
578
+ font-style: italic;
579
+ margin-bottom: 1rem;
580
+ }
581
+
582
+ .header .description {
583
+ font-size: 1rem;
584
+ opacity: 0.8;
585
+ max-width: 600px;
586
+ margin: 0 auto;
587
+ }
588
+
589
+ .main-content {
590
+ background: white;
591
+ border-radius: 0 0 15px 15px;
592
+ box-shadow: var(--shadow);
593
+ overflow: hidden;
594
+ }
595
+
596
+ .section {
597
+ padding: 2.5rem;
598
+ border-bottom: 1px solid var(--border-color);
599
+ position: relative;
600
+ }
601
+
602
+ .section:last-child {
603
+ border-bottom: none;
604
+ }
605
+
606
+ .section-header {
607
+ display: flex;
608
+ align-items: center;
609
+ margin-bottom: 2rem;
610
+ padding-bottom: 1rem;
611
+ border-bottom: 2px solid var(--accent-color);
612
+ }
613
+
614
+ .section-header i {
615
+ font-size: 1.5rem;
616
+ color: var(--accent-color);
617
+ margin-right: 1rem;
618
+ }
619
+
620
+ .section-header h2 {
621
+ font-size: 1.8rem;
622
+ color: var(--primary-color);
623
+ font-weight: 400;
624
+ }
625
+
626
+ .upload-zone {
627
+ border: 2px dashed var(--border-color);
628
+ border-radius: 12px;
629
+ padding: 3rem 2rem;
630
+ text-align: center;
631
+ background: var(--cream);
632
+ transition: all 0.3s ease;
633
+ cursor: pointer;
634
+ position: relative;
635
+ overflow: hidden;
636
+ }
637
+
638
+ .upload-zone:hover {
639
+ border-color: var(--accent-color);
640
+ background: white;
641
+ transform: translateY(-2px);
642
+ box-shadow: var(--shadow-hover);
643
+ }
644
+
645
+ .upload-zone.dragover {
646
+ border-color: var(--primary-color);
647
+ background: rgba(26, 54, 93, 0.05);
648
+ }
649
+
650
+ .upload-icon {
651
+ font-size: 3rem;
652
+ color: var(--accent-color);
653
+ margin-bottom: 1rem;
654
+ }
655
+
656
+ .upload-text {
657
+ font-size: 1.1rem;
658
+ color: var(--text-muted);
659
+ margin-bottom: 1rem;
660
+ }
661
+
662
+ .file-types {
663
+ font-size: 0.9rem;
664
+ color: var(--text-muted);
665
+ font-style: italic;
666
+ }
667
+
668
+ .btn {
669
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
670
+ color: white;
671
+ border: none;
672
+ padding: 0.8rem 2rem;
673
+ border-radius: 8px;
674
+ font-family: inherit;
675
+ font-size: 1rem;
676
+ cursor: pointer;
677
+ transition: all 0.3s ease;
678
+ box-shadow: 0 2px 10px rgba(26, 54, 93, 0.2);
679
+ position: relative;
680
+ overflow: hidden;
681
+ }
682
+
683
+ .btn:hover {
684
+ transform: translateY(-2px);
685
+ box-shadow: 0 4px 20px rgba(26, 54, 93, 0.3);
686
+ }
687
+
688
+ .btn:active {
689
+ transform: translateY(0);
690
+ }
691
+
692
+ .btn-secondary {
693
+ background: linear-gradient(135deg, var(--accent-color) 0%, #d4a574 100%);
694
+ }
695
+
696
+ .btn-danger {
697
+ background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%);
698
+ }
699
+
700
+ .btn:disabled {
701
+ background: #cbd5e0;
702
+ color: #a0aec0;
703
+ cursor: not-allowed;
704
+ transform: none;
705
+ box-shadow: none;
706
+ }
707
+
708
+ .input-group {
709
+ margin-bottom: 1.5rem;
710
+ }
711
+
712
+ .form-control {
713
+ width: 100%;
714
+ padding: 1rem;
715
+ border: 2px solid var(--border-color);
716
+ border-radius: 8px;
717
+ font-family: inherit;
718
+ font-size: 1rem;
719
+ transition: border-color 0.3s ease;
720
+ background: var(--cream);
721
+ }
722
+
723
+ .form-control:focus {
724
+ outline: none;
725
+ border-color: var(--primary-color);
726
+ background: white;
727
+ box-shadow: 0 0 0 3px rgba(26, 54, 93, 0.1);
728
+ }
729
+
730
+ .question-textarea {
731
+ min-height: 120px;
732
+ resize: vertical;
733
+ }
734
+
735
+ .documents-grid {
736
+ display: grid;
737
+ gap: 1rem;
738
+ margin-top: 1rem;
739
+ }
740
+
741
+ .document-card {
742
+ background: var(--cream);
743
+ border: 1px solid var(--border-color);
744
+ border-radius: 10px;
745
+ padding: 1.5rem;
746
+ transition: all 0.3s ease;
747
+ position: relative;
748
+ }
749
+
750
/* --- Document cards --- */
.document-card:hover {
    background: white;
    box-shadow: var(--shadow);
    transform: translateY(-2px);
}

.document-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 0.5rem;
}

.document-title {
    font-weight: 600;
    color: var(--primary-color);
    font-size: 1.1rem;
}

.document-meta {
    color: var(--text-muted);
    font-size: 0.9rem;
}

/* --- Answer panel --- */
.answer-container {
    background: white;
    border: 1px solid var(--border-color);
    border-radius: 12px;
    margin-top: 1.5rem;
    overflow: hidden;
    box-shadow: var(--shadow);
}

.answer-header {
    background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
    color: white;
    padding: 1rem 1.5rem;
    font-weight: 500;
}

.answer-content {
    padding: 2rem;
    background: var(--cream);
}

.answer-text {
    font-size: 1.1rem;
    line-height: 1.8;
    margin-bottom: 2rem;
}

/* --- Cited sources --- */
.sources-section {
    background: white;
    border-top: 1px solid var(--border-color);
    padding: 1.5rem;
}

.sources-title {
    color: var(--primary-color);
    font-size: 1.2rem;
    margin-bottom: 1rem;
    display: flex;
    align-items: center;
}

.sources-title i {
    margin-right: 0.5rem;
}

.source-item {
    background: var(--cream);
    border: 1px solid var(--border-color);
    border-radius: 8px;
    padding: 1rem;
    margin-bottom: 0.8rem;
    transition: all 0.3s ease;
}

.source-item:hover {
    background: white;
    box-shadow: 0 2px 10px rgba(26, 54, 93, 0.05);
}

.source-name {
    font-weight: 600;
    color: var(--primary-color);
    margin-bottom: 0.3rem;
}

.source-similarity {
    color: var(--text-muted);
    font-size: 0.9rem;
}

/* --- Status banners (upload / error feedback) --- */
.status-message {
    padding: 1rem 1.5rem;
    border-radius: 8px;
    margin: 1rem 0;
    font-weight: 500;
    display: flex;
    align-items: center;
}

.status-message i {
    margin-right: 0.5rem;
}

.status-success {
    background: #f0fff4;
    color: #22543d;
    border: 1px solid #9ae6b4;
}

.status-error {
    background: #fed7d7;
    color: #742a2a;
    border: 1px solid #fc8181;
}

/* --- Busy spinner shown inside buttons --- */
.loading-spinner {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 2px solid rgba(255, 255, 255, 0.3);
    border-radius: 50%;
    border-top-color: white;
    animation: spin 0.8s linear infinite;
    margin-right: 0.5rem;
}

@keyframes spin {
    to { transform: rotate(360deg); }
}

/* --- Layout helpers --- */
.controls {
    display: flex;
    gap: 1rem;
    align-items: center;
    flex-wrap: wrap;
    margin-top: 1.5rem;
}

.hidden {
    display: none;
}

.empty-state {
    text-align: center;
    padding: 2rem;
    color: var(--text-muted);
    font-style: italic;
}

.empty-state i {
    font-size: 3rem;
    color: var(--accent-color);
    margin-bottom: 1rem;
    display: block;
}

/* --- Stats bar above the document grid --- */
.stats-bar {
    background: var(--cream);
    padding: 1rem 1.5rem;
    border-radius: 8px;
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 1.5rem;
    border: 1px solid var(--border-color);
}

.stat-item {
    text-align: center;
}

.stat-value {
    font-size: 1.5rem;
    font-weight: 600;
    color: var(--primary-color);
}

.stat-label {
    font-size: 0.9rem;
    color: var(--text-muted);
}

/* --- Small screens --- */
@media (max-width: 768px) {
    .container {
        padding: 10px;
    }

    .section {
        padding: 1.5rem;
    }

    .header h1 {
        font-size: 2rem;
    }

    .controls {
        flex-direction: column;
        align-items: stretch;
    }

    .btn {
        width: 100%;
    }
}
</style>
</head>
<body>
    <div class="container">
        <!-- Page banner -->
        <div class="header">
            <div class="header-content">
                <h1><i class="fas fa-university"></i> Scholar's Archive</h1>
                <p class="subtitle">Document Intelligence System</p>
                <p class="description">A sophisticated platform for intelligent document analysis and question answering using advanced retrieval-augmented generation technology</p>
            </div>
        </div>

        <div class="main-content">
            <!-- Upload section -->
            <div class="section">
                <div class="section-header">
                    <i class="fas fa-cloud-upload-alt"></i>
                    <h2>Document Repository</h2>
                </div>

                <div class="upload-zone" id="uploadZone">
                    <div class="upload-icon">
                        <i class="fas fa-file-upload"></i>
                    </div>
                    <div class="upload-text">
                        <strong>Drop your documents here</strong> or click to browse
                    </div>
                    <div class="file-types">
                        Supported formats: PDF, DOCX, TXT, CSV
                    </div>
                    <input type="file" id="fileInput" accept=".pdf,.docx,.txt,.csv" style="display: none;">
                </div>

                <div class="controls">
                    <button class="btn" onclick="uploadDocument()" id="uploadBtn">
                        <i class="fas fa-upload"></i> Upload Document
                    </button>
                    <button class="btn btn-danger" onclick="clearAllDocuments()" id="clearBtn">
                        <i class="fas fa-trash-alt"></i> Clear Repository
                    </button>
                </div>

                <div id="uploadStatus"></div>
            </div>

            <!-- Repository contents + stats -->
            <div class="section">
                <div class="section-header">
                    <i class="fas fa-books"></i>
                    <h2>Document Collection</h2>
                </div>

                <div class="stats-bar" id="statsBar">
                    <div class="stat-item">
                        <div class="stat-value" id="docCount">0</div>
                        <div class="stat-label">Documents</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value" id="chunkCount">0</div>
                        <div class="stat-label">Text Chunks</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value" id="pageCount">0</div>
                        <div class="stat-label">Total Pages</div>
                    </div>
                </div>

                <div id="documentsList" class="documents-grid">
                    <div class="empty-state">
                        <i class="fas fa-folder-open"></i>
                        <div>No documents in repository</div>
                    </div>
                </div>
            </div>

            <!-- Question / answer section -->
            <div class="section">
                <div class="section-header">
                    <i class="fas fa-search"></i>
                    <h2>Intelligent Inquiry</h2>
                </div>

                <div class="input-group">
                    <textarea
                        id="questionInput"
                        class="form-control question-textarea"
                        placeholder="Enter your scholarly inquiry about the uploaded documents..."
                        rows="4"
                    ></textarea>
                </div>

                <div class="controls">
                    <button class="btn" onclick="askQuestion()" id="askBtn">
                        <i class="fas fa-brain"></i> Submit Inquiry
                    </button>
                    <button class="btn btn-secondary" onclick="clearAnswer()" id="clearAnswerBtn">
                        <i class="fas fa-eraser"></i> Clear Response
                    </button>
                </div>

                <div id="answerContainer" class="answer-container hidden">
                    <div class="answer-header">
                        <i class="fas fa-lightbulb"></i> Scholarly Response
                    </div>
                    <div class="answer-content">
                        <div id="answerText" class="answer-text"></div>
                    </div>
                    <div id="sourcesSection" class="sources-section">
                        <div class="sources-title">
                            <i class="fas fa-quote-left"></i> Referenced Sources
                        </div>
                        <div id="sourcesList"></div>
                    </div>
                </div>
            </div>
        </div>
    </div>
+ <script>
1074
+ let isUploading = false;
1075
+ let isAsking = false;
1076
+
1077
+
1078
+ document.addEventListener('DOMContentLoaded', function() {
1079
+ loadDocuments();
1080
+ setupEventListeners();
1081
+ });
1082
+
1083
+ function setupEventListeners() {
1084
+ const uploadZone = document.getElementById('uploadZone');
1085
+ const fileInput = document.getElementById('fileInput');
1086
+ const questionInput = document.getElementById('questionInput');
1087
+
1088
+
1089
+ uploadZone.addEventListener('click', () => fileInput.click());
1090
+
1091
+ uploadZone.addEventListener('dragover', (e) => {
1092
+ e.preventDefault();
1093
+ uploadZone.classList.add('dragover');
1094
+ });
1095
+
1096
+ uploadZone.addEventListener('dragleave', () => {
1097
+ uploadZone.classList.remove('dragover');
1098
+ });
1099
+
1100
+ uploadZone.addEventListener('drop', (e) => {
1101
+ e.preventDefault();
1102
+ uploadZone.classList.remove('dragover');
1103
+ const files = e.dataTransfer.files;
1104
+ if (files.length > 0) {
1105
+ fileInput.files = files;
1106
+ uploadDocument();
1107
+ }
1108
+ });
1109
+
1110
+
1111
+ fileInput.addEventListener('change', uploadDocument);
1112
+
1113
+ questionInput.addEventListener('keydown', (e) => {
1114
+ if (e.key === 'Enter' && (e.ctrlKey || e.metaKey)) {
1115
+ askQuestion();
1116
+ }
1117
+ });
1118
+ }
1119
+
1120
+ function showMessage(message, type, icon = null) {
1121
+ const statusDiv = document.getElementById('uploadStatus');
1122
+ const iconHtml = icon ? `<i class="fas fa-${icon}"></i>` : '';
1123
+ statusDiv.innerHTML = `<div class="status-message status-${type}">${iconHtml}${message}</div>`;
1124
+ setTimeout(() => {
1125
+ statusDiv.innerHTML = '';
1126
+ }, 5000);
1127
+ }
1128
+
1129
+ function setLoadingState(isLoading, buttonId, loadingText, normalText, normalIcon = null) {
1130
+ const button = document.getElementById(buttonId);
1131
+ if (isLoading) {
1132
+ button.innerHTML = `<span class="loading-spinner"></span>${loadingText}`;
1133
+ button.disabled = true;
1134
+ } else {
1135
+ const iconHtml = normalIcon ? `<i class="fas fa-${normalIcon}"></i> ` : '';
1136
+ button.innerHTML = `${iconHtml}${normalText}`;
1137
+ button.disabled = false;
1138
+ }
1139
+ }
1140
+
1141
+ async function uploadDocument() {
1142
+ const fileInput = document.getElementById('fileInput');
1143
+ const file = fileInput.files[0];
1144
+
1145
+ if (!file) return;
1146
+
1147
+ isUploading = true;
1148
+ setLoadingState(true, 'uploadBtn', 'Processing Document...', 'Upload Document', 'upload');
1149
+
1150
+ const formData = new FormData();
1151
+ formData.append('file', file);
1152
+
1153
+ try {
1154
+ const response = await fetch('/upload', {
1155
+ method: 'POST',
1156
+ body: formData
1157
+ });
1158
+
1159
+ const result = await response.json();
1160
+
1161
+ if (response.ok) {
1162
+ showMessage(result.message, 'success', 'check-circle');
1163
+ fileInput.value = '';
1164
+ await loadDocuments();
1165
+ } else {
1166
+ showMessage(result.detail || 'Upload failed', 'error', 'exclamation-triangle');
1167
+ }
1168
+ } catch (error) {
1169
+ showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
1170
+ } finally {
1171
+ isUploading = false;
1172
+ setLoadingState(false, 'uploadBtn', 'Processing Document...', 'Upload Document', 'upload');
1173
+ }
1174
+ }
1175
+
1176
+ async function askQuestion() {
1177
+ const questionInput = document.getElementById('questionInput');
1178
+ const question = questionInput.value.trim();
1179
+
1180
+ if (!question) {
1181
+ showMessage('Please enter a question', 'error', 'exclamation-triangle');
1182
+ return;
1183
+ }
1184
+
1185
+ isAsking = true;
1186
+ setLoadingState(true, 'askBtn', 'Analyzing Documents...', 'Submit Inquiry', 'brain');
1187
+
1188
+ try {
1189
+ const response = await fetch('/ask', {
1190
+ method: 'POST',
1191
+ headers: {
1192
+ 'Content-Type': 'application/json',
1193
+ },
1194
+ body: JSON.stringify({ question: question })
1195
+ });
1196
+
1197
+ const result = await response.json();
1198
+
1199
+ if (response.ok) {
1200
+ displayAnswer(result.answer, result.sources);
1201
+ } else {
1202
+ showMessage(result.detail || 'Failed to get answer', 'error', 'exclamation-triangle');
1203
+ }
1204
+ } catch (error) {
1205
+ showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
1206
+ } finally {
1207
+ isAsking = false;
1208
+ setLoadingState(false, 'askBtn', 'Analyzing Documents...', 'Submit Inquiry', 'brain');
1209
+ }
1210
+ }
1211
+
1212
+ function displayAnswer(answer, sources) {
1213
+ const answerContainer = document.getElementById('answerContainer');
1214
+ const answerText = document.getElementById('answerText');
1215
+ const sourcesList = document.getElementById('sourcesList');
1216
+
1217
+ answerText.innerHTML = answer;
1218
+
1219
+ if (sources && sources.length > 0) {
1220
+ let sourcesHtml = '';
1221
+ sources.forEach((source, index) => {
1222
+ const similarity = Math.round(source.similarity * 100);
1223
+ sourcesHtml += `
1224
+ <div class="source-item">
1225
+ <div class="source-name">
1226
+ <i class="fas fa-file-alt"></i> ${source.document}
1227
+ </div>
1228
+ <div class="source-similarity">
1229
+ Relevance: ${similarity}% • Chunk ${index + 1}
1230
+ </div>
1231
+ </div>
1232
+ `;
1233
+ });
1234
+ sourcesList.innerHTML = sourcesHtml;
1235
+ } else {
1236
+ sourcesList.innerHTML = `
1237
+ <div class="empty-state">
1238
+ <i class="fas fa-search"></i>
1239
+ <div>No specific sources referenced</div>
1240
+ </div>
1241
+ `;
1242
+ }
1243
+
1244
+ answerContainer.classList.remove('hidden');
1245
+ answerContainer.scrollIntoView({ behavior: 'smooth' });
1246
+ }
1247
+
1248
+ function clearAnswer() {
1249
+ const answerContainer = document.getElementById('answerContainer');
1250
+ answerContainer.classList.add('hidden');
1251
+ document.getElementById('questionInput').value = '';
1252
+ }
1253
+
1254
+ async function loadDocuments() {
1255
+ try {
1256
+ const [docsResponse, statsResponse] = await Promise.all([
1257
+ fetch('/documents'),
1258
+ fetch('/stats')
1259
+ ]);
1260
+
1261
+ const docsResult = await docsResponse.json();
1262
+ const statsResult = await statsResponse.json();
1263
+
1264
+ updateDocumentsList(docsResult.documents || []);
1265
+ updateStats(statsResult);
1266
+ } catch (error) {
1267
+ console.error('Error loading documents:', error);
1268
+ }
1269
+ }
1270
+
1271
+ function updateDocumentsList(documents) {
1272
+ const documentsList = document.getElementById('documentsList');
1273
+
1274
+ if (documents.length === 0) {
1275
+ documentsList.innerHTML = `
1276
+ <div class="empty-state">
1277
+ <i class="fas fa-folder-open"></i>
1278
+ <div>No documents in repository</div>
1279
+ </div>
1280
+ `;
1281
+ return;
1282
+ }
1283
+
1284
+ let html = '';
1285
+ documents.forEach(doc => {
1286
+ html += `
1287
+ <div class="document-card">
1288
+ <div class="document-header">
1289
+ <div class="document-title">
1290
+ <i class="fas fa-file-alt"></i> ${doc.title}
1291
+ </div>
1292
+ </div>
1293
+ <div class="document-meta">
1294
+ <i class="fas fa-layer-group"></i> ${doc.chunk_count} chunks
1295
+ ${doc.total_pages ? ` • <i class="fas fa-file-pdf"></i> ${doc.total_pages} pages` : ''}
1296
+ </div>
1297
+ </div>
1298
+ `;
1299
+ });
1300
+ documentsList.innerHTML = html;
1301
+ }
1302
+
1303
+ function updateStats(stats) {
1304
+ document.getElementById('docCount').textContent = stats.total_documents || 0;
1305
+ document.getElementById('chunkCount').textContent = stats.total_chunks || 0;
1306
+ document.getElementById('pageCount').textContent = stats.total_pages || 0;
1307
+ }
1308
+
1309
+ async function clearAllDocuments() {
1310
+ if (!confirm('Are you sure you want to clear all documents from the repository? This action cannot be undone.')) {
1311
+ return;
1312
+ }
1313
+
1314
+ setLoadingState(true, 'clearBtn', 'Clearing Repository...', 'Clear Repository', 'trash-alt');
1315
+
1316
+ try {
1317
+ const response = await fetch('/clear', {
1318
+ method: 'DELETE'
1319
+ });
1320
+
1321
+ const result = await response.json();
1322
+
1323
+ if (response.ok) {
1324
+ showMessage(result.message, 'success', 'check-circle');
1325
+ await loadDocuments();
1326
+ clearAnswer();
1327
+ } else {
1328
+ showMessage(result.detail || 'Failed to clear documents', 'error', 'exclamation-triangle');
1329
+ }
1330
+ } catch (error) {
1331
+ showMessage('Network error: ' + error.message, 'error', 'exclamation-triangle');
1332
+ } finally {
1333
+ setLoadingState(false, 'clearBtn', 'Clearing Repository...', 'Clear Repository', 'trash-alt');
1334
+ }
1335
+ }
1336
+ </script>
1337
+ </body>
1338
+ </html>