jessica45 committed on
Commit
5f04d6e
·
verified ·
1 Parent(s): e52d28d

updated rag

Browse files
Files changed (5) hide show
  1. embeddings_qdrant.py +382 -0
  2. index_docs.py +101 -0
  3. main.py +196 -0
  4. rag_with_gemini.py +201 -0
  5. requirements.txt +6 -10
embeddings_qdrant.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import google.generativeai as genai
4
+ from dotenv import load_dotenv
5
+ from typing import List, Dict, Optional, Union
6
+ import json
7
+ import pickle
8
+ import uuid
9
+ from qdrant_client import QdrantClient
10
+ from qdrant_client.http import models
11
+ from qdrant_client.http.models import Distance, VectorParams, PointStruct
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
+
16
class EmbeddingManager:
    """Thin wrapper around the Gemini embedding API.

    Uses ``models/text-embedding-004`` to produce float32 vectors for
    document chunks and search queries. On any API failure an empty array
    is returned instead of raising, so callers check ``.size``.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Configure Gemini with the given key or GEMINI_API_KEY from the environment."""
        self.api_key = api_key or os.getenv('GEMINI_API_KEY')
        if not self.api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables")

        genai.configure(api_key=self.api_key)
        self.model_name = "models/text-embedding-004"

    def _embed(self, content: str, task_type: str, label: str) -> np.ndarray:
        # Shared call path for document and query embeddings; `label`
        # only customizes the error message.
        try:
            response = genai.embed_content(
                model=self.model_name,
                content=content,
                task_type=task_type,
            )
            return np.array(response['embedding'], dtype=np.float32)
        except Exception as e:
            print(f"Error generating {label}embedding: {e}")
            return np.array([])

    def generate_embedding(self, text: str) -> np.ndarray:
        """Generate embedding for a single text (document task type)."""
        return self._embed(text, "retrieval_document", "")

    def generate_embeddings_batch(self, texts: List[str]) -> List[np.ndarray]:
        """Generate embeddings for multiple texts, skipping any that fail."""
        collected: List[np.ndarray] = []
        total = len(texts)
        for position, text in enumerate(texts, start=1):
            print(f"Generating embedding {position}/{total}")
            vector = self.generate_embedding(text)
            if vector.size > 0:
                collected.append(vector)
            else:
                print(f"Failed to generate embedding for text {position}")
        return collected

    def generate_query_embedding(self, query: str) -> np.ndarray:
        """Generate embedding for a query (search task type)."""
        return self._embed(query, "retrieval_query", "query ")
63
+
64
+
65
class QdrantVectorStore:
    """Vector store backed by Qdrant (local instance or Qdrant Cloud).

    Stores text chunks with their Gemini embeddings and supports cosine
    similarity search, optional neighbouring-chunk context, and a
    RAG-ready formatted output.
    """

    def __init__(self, collection_name: Optional[str] = None, url: Optional[str] = None, api_key: Optional[str] = None):
        """Connect to Qdrant Cloud when URL + API key are configured, else localhost:6333."""
        self.collection_name = collection_name or os.getenv('QDRANT_COLLECTION_NAME', 'rag_documents')

        qdrant_url = url or os.getenv('QDRANT_URL')
        qdrant_api_key = api_key or os.getenv('QDRANT_API_KEY')

        if qdrant_url and qdrant_api_key:
            # Qdrant Cloud
            print(f"Connecting to Qdrant Cloud at {qdrant_url}")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
        else:
            # Local Qdrant (default)
            print("Using local Qdrant instance at http://localhost:6333")
            self.client = QdrantClient("localhost", port=6333)

        # Dimension of models/text-embedding-004 vectors.
        self.embedding_dim = 768

    def create_collection(self, force_recreate: bool = False):
        """Create the collection if missing; drop and recreate when force_recreate is True.

        Raises:
            Exception: re-raises any client error after logging it.
        """
        try:
            collections = self.client.get_collections().collections
            collection_exists = any(col.name == self.collection_name for col in collections)

            if collection_exists and force_recreate:
                print(f"Deleting existing collection: {self.collection_name}")
                self.client.delete_collection(collection_name=self.collection_name)
                collection_exists = False

            if not collection_exists:
                print(f"Creating collection: {self.collection_name}")
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(size=self.embedding_dim, distance=Distance.COSINE),
                )
                print(f"✓ Collection '{self.collection_name}' created successfully")
            else:
                print(f"✓ Collection '{self.collection_name}' already exists")

        except Exception as e:
            print(f"Error creating collection: {e}")
            raise

    def add_documents(self, chunks: List[str], embeddings: List[np.ndarray], metadata: Optional[List[Dict]] = None, session_id: Optional[str] = None):
        """Add documents with their embeddings to Qdrant.

        Args:
            chunks: list of text chunks
            embeddings: list of numpy embeddings corresponding to chunks
            metadata: optional list of dicts with metadata per chunk
            session_id: optional session identifier to attach to each point payload

        Raises:
            ValueError: if the three list lengths differ.
        """
        if metadata is None:
            metadata = [{"index": i} for i in range(len(chunks))]

        if len(chunks) != len(embeddings) or len(chunks) != len(metadata):
            raise ValueError("chunks, embeddings, and metadata must have the same length")

        # Guard: avoid a pointless upsert call with an empty point list.
        if not chunks:
            print("No documents to add")
            return

        # Ensure collection exists
        self.create_collection()

        points = []
        for chunk, embedding, meta in zip(chunks, embeddings, metadata):
            payload = {
                "text": chunk,
                "metadata": meta
            }
            # Attach session info if provided
            if session_id is not None:
                payload["session_id"] = session_id

            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding.tolist(),
                payload=payload
            ))

        try:
            print(f"Uploading {len(points)} documents to Qdrant...")
            self.client.upsert(
                collection_name=self.collection_name,
                points=points
            )
            print(f"✓ Successfully uploaded {len(points)} documents")
        except Exception as e:
            print(f"Error uploading documents: {e}")
            raise

    def similarity_search(self, query_embedding: np.ndarray, top_k: int = 5, score_threshold: float = 0.0,
                          include_context: bool = False) -> List[Dict]:
        """
        Search for similar documents in Qdrant.

        Args:
            query_embedding: The query vector
            top_k: Number of results to return
            score_threshold: Minimum similarity score
            include_context: If True, try to include surrounding chunks for context

        Returns:
            List of result dicts with id/similarity/chunk/metadata/source/citation
            (and 'context' when requested); [] on error.
        """
        try:
            search_results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding.tolist(),
                limit=top_k,
                score_threshold=score_threshold
            )

            results = []
            for hit in search_results:
                # Tolerate points written without the expected payload shape.
                payload = hit.payload or {}
                metadata = payload.get('metadata', {})

                result = {
                    'id': hit.id,
                    'similarity': hit.score,
                    'chunk': payload.get('text', ''),
                    'metadata': metadata,
                    'source': {
                        'file_name': metadata.get('file_name', 'Unknown'),
                        'file_path': metadata.get('file_path', 'Unknown'),
                        'chunk_index': metadata.get('chunk_index', 0)
                    }
                }

                if include_context:
                    result['context'] = self._get_surrounding_context(metadata)

                result['citation'] = f"{metadata.get('file_name', 'Unknown')} (chunk {metadata.get('chunk_index', 0)})"

                results.append(result)

            return results

        except Exception as e:
            print(f"Error searching documents: {e}")
            return []

    def _get_surrounding_context(self, metadata: Dict) -> Dict:
        """Get surrounding chunks for context (if available).

        NOTE(review): this searches with a zero "dummy" vector and limit=10,
        so for files with many chunks the neighbours may not be in the result
        set — best-effort only.
        """
        try:
            file_path = metadata.get('file_path')
            chunk_index = metadata.get('chunk_index', 0)

            # Filter to chunks from the same source file.
            context_filter = {
                "must": [
                    {"key": "metadata.file_path", "match": {"value": file_path}}
                ]
            }

            context_results = self.client.search(
                collection_name=self.collection_name,
                query_vector=[0.0] * self.embedding_dim,  # Dummy vector
                query_filter=context_filter,
                limit=10,
                score_threshold=0.0
            )

            # Collect (index, text) pairs and order them by position in file.
            file_chunks = []
            for hit in context_results:
                hit_metadata = (hit.payload or {}).get('metadata', {})
                if hit_metadata.get('chunk_index') is not None:
                    file_chunks.append({
                        'index': hit_metadata['chunk_index'],
                        'text': (hit.payload or {}).get('text', '')
                    })

            file_chunks.sort(key=lambda x: x['index'])

            # Locate the current chunk among its siblings.
            current_idx = None
            for i, chunk in enumerate(file_chunks):
                if chunk['index'] == chunk_index:
                    current_idx = i
                    break

            return {
                'previous_chunk': file_chunks[current_idx - 1]['text'] if current_idx and current_idx > 0 else None,
                'next_chunk': file_chunks[current_idx + 1]['text'] if current_idx is not None and current_idx < len(file_chunks) - 1 else None,
                'total_chunks_in_file': len(file_chunks)
            }

        except Exception as e:
            print(f"Error getting context: {e}")
            return {'error': 'Could not retrieve context'}

    def get_relevant_passages(self, query_embedding: np.ndarray, top_k: int = 5) -> List[str]:
        """Return just the text passages for RAG prompt creation."""
        results = self.similarity_search(query_embedding, top_k)
        return [result['chunk'] for result in results if result['chunk']]

    def enhanced_search(self, query_embedding: np.ndarray, top_k: int = 5) -> str:
        """Return a formatted string with search results ready for RAG."""
        results = self.similarity_search(query_embedding, top_k, include_context=True)

        if not results:
            return "No relevant documents found."

        formatted_results = []
        for i, result in enumerate(results, 1):
            formatted_result = f"""
**Result {i}** (Similarity: {result['similarity']:.3f})
**Source**: {result['citation']}
**Content**: {result['chunk']}
"""
            # Add context if available
            if 'context' in result and not result['context'].get('error'):
                context = result['context']
                if context.get('previous_chunk'):
                    formatted_result += f"\n**Previous Context**: ...{context['previous_chunk'][-100:]}"
                if context.get('next_chunk'):
                    formatted_result += f"\n**Following Context**: {context['next_chunk'][:100]}..."

            formatted_results.append(formatted_result)

        # BUG FIX: the "=" divider was previously concatenated once before the
        # join ("\n" + "="*50 + "\n".join(...)), so only the first result got a
        # divider. Make the divider part of the separator between every result.
        separator = "\n" + "=" * 50 + "\n"
        return separator + separator.join(formatted_results)

    def get_collection_info(self) -> Dict:
        """Get information about the collection; {} on error."""
        try:
            info = self.client.get_collection(collection_name=self.collection_name)
            return {
                'name': self.collection_name,
                'points_count': info.points_count,
                # vectors_count is not exposed by some newer qdrant-client
                # versions; report None instead of crashing.
                'vectors_count': getattr(info, 'vectors_count', None),
                'status': info.status
            }
        except Exception as e:
            print(f"Error getting collection info: {e}")
            return {}

    def delete_collection(self):
        """Delete the collection, logging (not raising) on failure."""
        try:
            self.client.delete_collection(collection_name=self.collection_name)
            print(f"✓ Collection '{self.collection_name}' deleted")
        except Exception as e:
            print(f"Error deleting collection: {e}")
324
+
325
+
326
if __name__ == "__main__":
    # Smoke test: embed a few sample texts, store them in Qdrant, then run
    # a basic and an enhanced search against them.
    print("Testing Qdrant Vector Store...")

    try:
        embedding_manager = EmbeddingManager()
        qdrant_store = QdrantVectorStore()

        sample_texts = [
            "This is a sample document about machine learning and artificial intelligence.",
            "Python is a great programming language for data science and AI development.",
            "Qdrant is a vector database that enables similarity search at scale."
        ]

        print("Generating embeddings...")
        embeddings = embedding_manager.generate_embeddings_batch(sample_texts)

        if embeddings:
            metadata = [
                {"source": "sample_doc", "topic": "machine_learning", "index": 0},
                {"source": "sample_doc", "topic": "programming", "index": 1},
                {"source": "sample_doc", "topic": "database", "index": 2}
            ]

            qdrant_store.add_documents(sample_texts, embeddings, metadata)

            query = "What is vector database?"
            query_embedding = embedding_manager.generate_query_embedding(query)

            if query_embedding.size > 0:
                print(f"\n🔍 BASIC SEARCH: {query}")
                for result in qdrant_store.similarity_search(query_embedding, top_k=2):
                    print(f"Similarity: {result['similarity']:.4f}")
                    print(f"Source: {result['citation']}")
                    print(f"Text: {result['chunk']}")
                    print(f"Topic: {result['metadata']['topic']}")
                    print("---")

                print(f"\n🚀 ENHANCED SEARCH (RAG-ready format):")
                print(qdrant_store.enhanced_search(query_embedding, top_k=2))

            print(f"\nCollection Info: {qdrant_store.get_collection_info()}")

    except Exception as e:
        print(f"Error in test: {e}")
        print("Make sure:")
        print("1. Your GEMINI_API_KEY is valid in .env file")
        print("2. Qdrant is running (docker run -p 6333:6333 qdrant/qdrant) or configure Qdrant Cloud")
index_docs.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from docx import Document
3
+ try:
4
+ import fitz # PyMuPDF
5
+ except Exception:
6
+ # fall back to pymupdf module name if present
7
+ import pymupdf as fitz
8
+
9
def load_pdf_text(file_path: str) -> str:
    """Extract plain text from a PDF via PyMuPDF; return '' on any error."""
    try:
        document = fitz.open(file_path)
        parts = []
        # Iterate directly over pages.
        for page in document:
            try:
                # Standard PyMuPDF API.
                content = page.get_text()
            except Exception:
                # Older releases exposed getText() instead.
                content = page.getText() if hasattr(page, 'getText') else ''
            if content:
                parts.append(content + "\n")
        try:
            document.close()
        except Exception:
            pass
        return "".join(parts).strip()
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
        return ""
31
+
32
+
33
def load_docx_text(file_path: str) -> str:
    """Extract non-empty paragraph text from a .docx file; return '' on error."""
    try:
        parsed = Document(file_path)
        lines = [paragraph.text for paragraph in parsed.paragraphs if paragraph.text]
        return "\n".join(lines).strip()
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
        return ""
41
+
42
+
43
def load_txt_text(file_path: str) -> str:
    """Read a UTF-8 text file; return '' if it cannot be read."""
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as e:
        print(f"Error reading TXT {file_path}: {e}")
        return ""
50
+
51
+
52
def extract_text_from_path(path: str) -> Optional[str]:
    """Dispatch on file extension to the right loader; None for unsupported types."""
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        return load_pdf_text(path)
    if lowered.endswith('.docx'):
        return load_docx_text(path)
    if lowered.endswith('.txt'):
        return load_txt_text(path)
    return None
60
+
61
+
62
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    """Split *text* into chunks of up to ``chunk_size`` chars with ``overlap`` shared chars.

    Args:
        text: the input string (empty input yields []).
        chunk_size: maximum chunk length; must be positive.
        overlap: characters shared between consecutive chunks; values
            >= chunk_size are clamped (the original code looped forever).

    Returns:
        List of string chunks covering the whole input.

    Raises:
        ValueError: if chunk_size is not positive.
    """
    if chunk_size <= 0:
        # BUG FIX: previously this spun forever (step could be <= 0).
        raise ValueError("chunk_size must be positive")
    if not text:
        return []
    if overlap >= chunk_size or overlap < 0:
        # BUG FIX: a non-positive step made the original while-loop infinite.
        overlap = max(chunk_size // 5, 0)
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
72
+
73
+
74
if __name__ == '__main__':
    import sys

    # Manual smoke test: extract text from one file and preview its chunks.
    if len(sys.argv) < 2:
        print('Usage: python src/index_docs.py <path-to-file-or-folder> [chunk_size]')
        sys.exit(1)

    path = sys.argv[1]
    chunk_size = int(sys.argv[2]) if len(sys.argv) > 2 else 500

    print(f'Testing extraction for: {path}')
    text = extract_text_from_path(path)
    if not text:
        print('No text extracted or unsupported file type.')
        sys.exit(1)

    print('Characters extracted:', len(text))
    chunks = chunk_text(text, chunk_size=chunk_size)
    print('Chunks produced:', len(chunks))
    if chunks:
        preview = 300
        print('\n--- First chunk preview ---')
        print(chunks[0][:preview])
        print('\n--- Second chunk preview ---')
        print(chunks[1][:preview] if len(chunks) > 1 else '<none>')
main.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ import hashlib
5
+ from typing import List
6
+ from dotenv import load_dotenv
7
+ from rag_with_gemini import RAGSystem
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
+ # --- PAGE CONFIG ---
13
+ st.set_page_config(
14
+ page_title="RAG Document Assistant",
15
+ page_icon="🤖",
16
+ layout="wide",
17
+ initial_sidebar_state="expanded"
18
+ )
19
+
20
+ # --- SESSION STATE INIT ---
21
def initialize_session_state():
    """Seed every st.session_state key the app relies on, once per session."""
    defaults = {
        'rag_system': None,
        'documents_processed': [],
        # SHA256 hashes of processed files, to avoid reprocessing the same
        # file within a session.
        'processed_hashes': set(),
        'chat_history': [],
        'processing_status': "",
        'system_initialized': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
+ st.session_state.system_initialized = False
35
+
36
+ # --- RAG SYSTEM INIT ---
37
def initialize_rag_system():
    """Build the RAGSystem once per session; returns True when it is ready."""
    if st.session_state.system_initialized:
        return True
    try:
        gemini_api_key = os.getenv('GEMINI_API_KEY')
        qdrant_url = os.getenv('QDRANT_URL')
        qdrant_api_key = os.getenv('QDRANT_API_KEY')

        if not (gemini_api_key and qdrant_url and qdrant_api_key):
            st.error("❌ Missing API keys in your .env file.")
            return False

        with st.spinner("🚀 Initializing RAG system..."):
            st.session_state.rag_system = RAGSystem(gemini_api_key, qdrant_url, qdrant_api_key)
            st.session_state.system_initialized = True
            return True
    except Exception as e:
        st.error(f"❌ Initialization error: {e}")
        return False
57
+
58
+ # --- DOCUMENT PROCESSING ---
59
def process_uploaded_files(uploaded_files):
    """Deduplicate uploads by content hash, write temp files, and index them.

    Args:
        uploaded_files: Streamlit UploadedFile objects from the sidebar.

    Returns:
        True when processing succeeded or there was nothing new to do;
        False on failure. A human-readable outcome is always written to
        st.session_state.processing_status.
    """
    if not uploaded_files or not st.session_state.rag_system:
        return False
    try:
        temp_paths = []
        to_process = []
        skipped = []

        # Hash file contents so re-uploads of identical bytes are skipped.
        for uploaded_file in uploaded_files:
            data = uploaded_file.getvalue()
            digest = hashlib.sha256(data).hexdigest()
            if digest in st.session_state.processed_hashes:
                skipped.append(uploaded_file.name)
                continue

            # BUG FIX: the old f".{name.split('.')[-1]}" produced a bogus
            # suffix (e.g. ".readme" for a dotless name); splitext yields the
            # real extension, which the indexer dispatches on.
            suffix = os.path.splitext(uploaded_file.name)[1] or ".bin"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(data)
                temp_paths.append(tmp.name)
            to_process.append((uploaded_file.name, digest))

        # Short-circuit when every upload was a duplicate.
        if not temp_paths:
            st.session_state.processing_status = (
                f"⚠️ No new files to process. Skipped: {', '.join(skipped)}"
                if skipped else "⚠️ No files provided."
            )
            return True

        try:
            with st.spinner("📄 Processing documents..."):
                success = st.session_state.rag_system.add_documents(temp_paths)
        finally:
            # BUG FIX: cleanup used to run only on the non-raising path,
            # leaking temp files; and the bare `except:` also swallowed
            # KeyboardInterrupt. Always delete, catching only OS errors.
            for path in temp_paths:
                try:
                    os.unlink(path)
                except OSError:
                    pass

        if success:
            # Record processed filenames and their hashes.
            for name, digest in to_process:
                st.session_state.documents_processed.append(name)
                st.session_state.processed_hashes.add(digest)

            status_msg = f"✅ Processed {len(to_process)} documents!"
            if skipped:
                status_msg += f" Skipped {len(skipped)} duplicate(s): {', '.join(skipped)}"
            st.session_state.processing_status = status_msg
            return True
        st.session_state.processing_status = "❌ Failed to process documents."
        return False
    except Exception as e:
        st.session_state.processing_status = f"❌ Error: {str(e)}"
        return False
113
+
114
+ # --- CHAT DISPLAY ---
115
def display_chat_message(role: str, content: str, sources: List[str] = None):
    """Render one chat bubble with a role-specific avatar.

    `sources` is accepted for call-site compatibility but is not rendered here.
    """
    if role == "assistant":
        avatar_url = "https://cdn-icons-png.flaticon.com/512/4712/4712035.png"
    else:
        avatar_url = "https://cdn-icons-png.flaticon.com/512/1077/1077012.png"
    with st.chat_message(role, avatar=avatar_url):
        st.markdown(content)
123
+
124
+ # --- MAIN ---
125
def main():
    """Streamlit entry point: sidebar handles uploads, main pane hosts the chat."""
    initialize_session_state()
    st.markdown('<h1 class="main-header">RAG Document Assistant</h1>', unsafe_allow_html=True)

    if not initialize_rag_system():
        st.stop()

    # --- Sidebar: uploads, status, processed-file list, chat reset ---
    with st.sidebar:
        st.markdown("### 📁 Upload Documents")
        uploaded_files = st.file_uploader("Choose files", type=['pdf', 'txt', 'docx'], accept_multiple_files=True)
        if uploaded_files and st.button("📤 Process Documents"):
            if process_uploaded_files(uploaded_files):
                st.rerun()

        if st.session_state.processing_status:
            msg = st.session_state.processing_status
            cls = "success-message" if "✅" in msg else "error-message"
            st.markdown(f'<div class="{cls}">{msg}</div>', unsafe_allow_html=True)

        if st.session_state.documents_processed:
            st.markdown("### ✅ Processed Files")
            for doc in st.session_state.documents_processed:
                st.write(f"- {doc}")

        if st.button("🗑️ Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()

    # Welcome banner on a completely fresh session.
    if not st.session_state.chat_history and not st.session_state.documents_processed:
        st.markdown("""
        <div style="text-align:center; padding:3rem; color:#9ca3af;">
        <h3>👋 Welcome to your RAG Assistant</h3>
        <p>Upload documents in the sidebar, then ask me anything about their content.</p>
        </div>
        """, unsafe_allow_html=True)

    # Replay the conversation so far.
    for message in st.session_state.chat_history:
        display_chat_message(message["role"], message["content"], message.get("sources", []))

    # New user turn.
    if prompt := st.chat_input("💬 Ask me anything..."):
        if not st.session_state.documents_processed:
            st.warning("⚠️ Upload and process documents first!")
            return

        st.session_state.chat_history.append({"role": "user", "content": prompt})
        display_chat_message("user", prompt)

        with st.chat_message("assistant"):
            with st.spinner("🤔 Thinking..."):
                try:
                    result = st.session_state.rag_system.query(prompt)
                    st.markdown(result['answer'])
                    st.session_state.chat_history.append({
                        "role": "assistant",
                        "content": result['answer'],
                        "sources": result['sources']
                    })
                except Exception as e:
                    error_msg = f"❌ Error: {str(e)}"
                    st.error(error_msg)
                    st.session_state.chat_history.append({"role": "assistant", "content": error_msg})


if __name__ == "__main__":
    main()
rag_with_gemini.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG (Retrieval-Augmented Generation) system with Gemini
3
+ """
4
+
5
+ import google.generativeai as genai
6
+ import time
7
+ import logging
8
+ from typing import List, Dict, Any, Optional
9
+ from embeddings_qdrant import EmbeddingManager, QdrantVectorStore
10
+ from index_docs import extract_text_from_path, chunk_text
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
class RAGSystem:
    """Complete RAG pipeline: Gemini for generation, Qdrant for retrieval.

    Lifecycle: construct (recreates the Qdrant collection), call
    add_documents() to index files, then query() for retrieval-augmented
    answers.
    """

    def __init__(self, gemini_api_key: str, qdrant_url: str, qdrant_api_key: str):
        """Configure Gemini and connect to Qdrant.

        Raises:
            RuntimeError: if the Qdrant connection or collection setup fails.
        """
        # Configure Gemini
        genai.configure(api_key=gemini_api_key)
        self.model = genai.GenerativeModel("models/gemini-2.5-flash")

        # Initialize components
        self.embedding_manager = EmbeddingManager(gemini_api_key)

        try:
            self.vector_store = QdrantVectorStore(url=qdrant_url, api_key=qdrant_api_key)
            self.vector_store.create_collection(force_recreate=True)
            logger.info("✅ Connected to Qdrant Cloud")
            self.using_qdrant = True
        except Exception as e:
            # BUG FIX: the old "fallback" branch called
            # self.vector_store.create_collection() even though
            # self.vector_store was never assigned when the constructor above
            # raised, guaranteeing an AttributeError. No simple/local fallback
            # store exists in this project, so fail fast with a clear error.
            logger.error(f"❌ Qdrant Cloud connection failed: {e}")
            self.using_qdrant = False
            raise RuntimeError(
                "Could not connect to Qdrant; check QDRANT_URL/QDRANT_API_KEY"
            ) from e

    def add_documents(self, file_paths: List[str], session_id: Optional[str] = None) -> bool:
        """Extract, chunk, embed, and store the given files in the vector store.

        Args:
            file_paths: paths to .pdf/.docx/.txt files.
            session_id: optional session identifier stored with each point.

        Returns:
            True only when at least one chunk was embedded and stored.
        """
        try:
            all_chunks = []

            for file_path in file_paths:
                logger.info(f"Processing {file_path}")

                text = extract_text_from_path(file_path)
                if not text:
                    logger.warning(f"No text extracted from {file_path}")
                    continue

                for chunk in chunk_text(text):
                    all_chunks.append({
                        'text': chunk,
                        'source': file_path,
                        'chunk_id': len(all_chunks)  # running index across all files
                    })

            if not all_chunks:
                logger.error("No chunks to process")
                return False

            logger.info(f"Generating embeddings for {len(all_chunks)} chunks")

            embeddings = []
            texts = []
            metadata_list = []

            for i, chunk in enumerate(all_chunks):
                try:
                    embedding = self.embedding_manager.generate_embedding(chunk['text'])
                    # BUG FIX: generate_embedding signals failure with an empty
                    # array; previously these were appended anyway and would
                    # break the Qdrant upsert.
                    if embedding.size == 0:
                        logger.error(f"Empty embedding for chunk {i}; skipping")
                        continue

                    embeddings.append(embedding)
                    texts.append(chunk['text'])
                    metadata_list.append({
                        'source': chunk['source'],
                        'chunk_id': chunk['chunk_id']
                    })

                    logger.info(f"Generated embedding {i+1}/{len(all_chunks)}")

                    # Small delay to avoid rate limits
                    time.sleep(0.1)

                except Exception as e:
                    logger.error(f"Error processing chunk {i}: {e}")
                    continue

            if not embeddings:
                # BUG FIX: previously returned True even when nothing was stored.
                logger.error("No embeddings generated; nothing stored")
                return False

            logger.info(f"Storing {len(embeddings)} embeddings in vector database (session={session_id})")
            # Forward session_id so it is stored with each point
            self.vector_store.add_documents(texts, embeddings, metadata_list, session_id=session_id)

            logger.info("Document processing completed successfully!")
            return True

        except Exception as e:
            logger.error(f"Error adding documents: {e}")
            return False

    def make_rag_prompt(self, query: str, context_passages: List[str]) -> str:
        """Build the grounded-answer prompt from the retrieved passages."""
        context = "\n\n".join([f"Context {i+1}: {passage}" for i, passage in enumerate(context_passages)])

        prompt = f"""You are a helpful assistant. Answer the user's question based on the provided context. If the context doesn't contain enough information to answer the question, say so clearly.

Context:
{context}

Question: {query}

Answer:"""

        return prompt

    def generate_answer(self, prompt: str, max_retries: int = 3) -> str:
        """Generate an answer with Gemini, retrying with exponential backoff on 429s.

        Never raises: every failure path returns an apologetic string instead.
        """
        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(prompt)

                if response and response.text:
                    return response.text.strip()
                logger.warning(f"Empty response on attempt {attempt + 1}")

            except Exception as e:
                logger.error(f"Error generating answer (attempt {attempt + 1}): {e}")

                if "429" in str(e) or "quota" in str(e).lower():
                    if attempt < max_retries - 1:
                        wait_time = (2 ** attempt) * 2  # Exponential backoff
                        logger.info(f"Rate limit hit, waiting {wait_time} seconds...")
                        time.sleep(wait_time)
                    else:
                        return "I'm sorry, I'm currently experiencing high demand. Please try again in a few minutes."
                elif attempt < max_retries - 1:
                    time.sleep(1)
                else:
                    return f"I encountered an error while generating the answer: {str(e)}"

        return "I'm sorry, I couldn't generate an answer at this time. Please try again."

    def query(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """Run the full RAG loop: embed, retrieve, prompt, generate.

        Returns:
            Dict with 'answer' (str), 'sources' (deduplicated list of file
            paths) and 'context_used' (list of retrieved passages).
        """
        try:
            logger.info(f"Processing query: {question}")

            # BUG FIX: use the query-specific embedding (task_type
            # "retrieval_query"); the old code embedded the question with the
            # document task type, which degrades retrieval quality.
            query_embedding = self.embedding_manager.generate_query_embedding(question)
            if query_embedding.size == 0:
                return {
                    'answer': "I couldn't find relevant information to answer your question.",
                    'sources': [],
                    'context_used': []
                }

            search_results = self.vector_store.similarity_search(
                query_embedding=query_embedding,
                top_k=top_k
            )

            if not search_results:
                return {
                    'answer': "I couldn't find relevant information to answer your question.",
                    'sources': [],
                    'context_used': []
                }

            context_passages = [result.get('chunk', '') for result in search_results]
            sources = [result.get('metadata', {}).get('source', 'Unknown') for result in search_results]

            rag_prompt = self.make_rag_prompt(question, context_passages)
            answer = self.generate_answer(rag_prompt)

            return {
                'answer': answer,
                'sources': list(set(sources)),  # Remove duplicates
                'context_used': context_passages
            }

        except Exception as e:
            logger.error(f"Error in query processing: {e}")
            return {
                'answer': f"I encountered an error while processing your question: {str(e)}",
                'sources': [],
                'context_used': []
            }
+ }
198
+
199
+ def handle_query(rag_system: RAGSystem, query: str) -> Dict[str, Any]:
200
+ """Handle a single query through the RAG system"""
201
+ return rag_system.query(query)
requirements.txt CHANGED
@@ -1,12 +1,8 @@
1
  google-generativeai>=0.3.0
2
- chromadb>=0.4.0
3
- pdfplumber
4
- pip<24.1
5
  python-docx>=0.8.11
6
- langchain>=0.1.0
7
- numpy>=1.21.0
8
- python-dotenv>=0.19.0
9
- streamlit>=1.18.0
10
- typing-extensions>=3.7.4
11
- tika
12
- pymupdf
 
1
  google-generativeai>=0.3.0
2
+ python-dotenv>=1.0.0
3
+ numpy>=1.24.0
4
+ pymupdf>=1.23.0
5
  python-docx>=0.8.11
6
+ qdrant-client>=1.7.0
7
+ streamlit>=1.28.0
8
+ scikit-learn>=1.3.0