Spaces:

abhinav0231
/

SparrowTale

Sleeping

App Files Community

abhinav0231 committed on Sep 5, 2025

Commit

d1ebc00

verified ·

1 Parent(s): 7367624

Update rag_agent.py

Browse files

Files changed (1) hide show

rag_agent.py +75 -66

rag_agent.py CHANGED Viewed

@@ -2,92 +2,101 @@ import os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
-import PyPDF2
-from langchain_google_genai import ChatGoogleGenerativeAI
 import streamlit as st
-def load_document(file_path: str) -> str:
-    """Load document content from PDF or TXT file."""
-    try:
-        if file_path.endswith(".pdf"):
-            with open(file_path, 'rb') as file:
-                pdf_reader = PyPDF2.PdfReader(file)
-                text = ""
-                for page in pdf_reader.pages:
-                    text += page.extract_text() + "\n"
-                return text
-        elif file_path.endswith(".txt"):
-            with open(file_path, 'r', encoding='utf-8') as file:
-                return file.read()
-        else:
-            return "Error: Unsupported file format."
-    except Exception as e:
-        return f"Error reading file: {str(e)}"
-def simple_text_search(query: str, document_text: str, max_chunks: int = 3) -> str:
-    """Simple TF-IDF based text retrieval - much faster than FAISS."""
-    # Split document into chunks
-    chunks = []
-    words = document_text.split()
-    chunk_size = 200  # words per chunk
-    for i in range(0, len(words), chunk_size):
-        chunk = " ".join(words[i:i + chunk_size])
-        if chunk.strip():
-            chunks.append(chunk)
-    if not chunks:
-        return "No content found in document."
-    # Create TF-IDF vectors
-    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
     try:
-        # Vectorize chunks and query
-        chunk_vectors = vectorizer.fit_transform(chunks)
         query_vector = vectorizer.transform([query])
-        # Calculate similarity
-        similarities = cosine_similarity(query_vector, chunk_vectors).flatten()
-        # Get top matching chunks
-        top_indices = similarities.argsort()[-max_chunks:][::-1]
-        relevant_chunks = [chunks[i] for i in top_indices if similarities[i] > 0.1]
-        return "\n\n".join(relevant_chunks[:max_chunks])
     except Exception as e:
-        return f"Search error: {str(e)}"
 def run_rag_agent(user_prompt: str, file_path: str) -> str:
-    """Simple but effective RAG implementation."""
-    print("--- RAG Agent Activated (Lightweight Version) ---")
-    # Load document
-    document_text = load_document(file_path)
-    if document_text.startswith("Error"):
-        return document_text
-    # Generate search query using LLM
-    api_key = st.secrets.get("GEMINI_API_KEY", os.getenv("GEMINI_API_KEY"))
-    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=api_key)
-    search_prompt = f"""Based on this story idea: "{user_prompt}"
-What are the 2-3 most important keywords to search for in a document to find relevant context?
-Respond with just the keywords separated by spaces."""
     try:
         response = llm.invoke(search_prompt)
         search_query = response.content.strip()
         print(f"Generated Search Query: {search_query}")
-    except:
-        search_query = user_prompt  # Fallback
-    # Retrieve relevant content
-    context = simple_text_search(search_query, document_text)
     print("--- RAG Agent Finished ---")
-    return context

 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+from typing import List, Dict
 import streamlit as st
+def get_document_context(file_path: str, query: str) -> str:
+    """
+    Lightweight document retrieval using TF-IDF instead of FAISS.
+    """
+    print("--- Using TF-IDF for document retrieval ---")
+    # Load document
+    if file_path.endswith(".pdf"):
+        try:
+            from pypdf import PdfReader
+            reader = PdfReader(file_path)
+            documents = []
+            for page in reader.pages:
+                text = page.extract_text()
+                if text.strip():
+                    documents.append(text)
+        except:
+            return "Error: Could not read PDF file."
+    elif file_path.endswith(".txt"):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+                # Split into chunks of ~1000 characters
+                documents = [content[i:i+1000] for i in range(0, len(content), 800)]
+        except:
+            return "Error: Could not read text file."
+    else:
+        return "Error: Unsupported file format. Please upload a .pdf or .txt file."
+    if not documents:
+        return "Error: Document is empty or could not be read."
     try:
+        # Create TF-IDF vectors - this is our "embedding" replacement
+        vectorizer = TfidfVectorizer(
+            stop_words='english',
+            max_features=5000,
+            ngram_range=(1, 2)  # Include bigrams for better context
+        )
+        # Transform documents and query
+        doc_vectors = vectorizer.fit_transform(documents)
         query_vector = vectorizer.transform([query])
+        # Calculate similarities
+        similarities = cosine_similarity(query_vector, doc_vectors).flatten()
+        # Get top 3 most relevant chunks
+        top_indices = similarities.argsort()[-3:][::-1]
+        context_chunks = []
+        for idx in top_indices:
+            if similarities[idx] > 0.1:  # Only include if reasonably relevant
+                context_chunks.append(documents[idx])
+        context = "\n\n".join(context_chunks)
+        return context if context else "No relevant context found in the document."
     except Exception as e:
+        print(f"An error occurred during document processing: {e}")
+        return "Error: Failed to process the provided document."
 def run_rag_agent(user_prompt: str, file_path: str) -> str:
+    """
+    The main agentic function - keep the same interface as before.
+    """
+    print("--- RAG Agent Activated (Lightweight TF-IDF Mode) ---")
+    # Generate optimized search query using LLM (same logic as before)
+    from llm_setup import llm
+    if not llm:
+        return "Error: LLM not available for query generation."
     try:
+        search_prompt = f"""You are a research assistant. Based on the user's story idea, what is the single most
+important keyword or question to search for within their provided document to find relevant context?
+User's Story Idea: '{user_prompt}'
+Optimized Search Query for Document:"""
         response = llm.invoke(search_prompt)
         search_query = response.content.strip()
         print(f"Generated Search Query: {search_query}")
+    except Exception as e:
+        print(f"Query generation failed, using original prompt: {e}")
+        search_query = user_prompt
+    # Use our lightweight retrieval
+    context = get_document_context(file_path, search_query)
     print("--- RAG Agent Finished ---")
+    return context