File size: 3,672 Bytes
ab56328
 
 
 
d1ebc00
ab56328
 
d1ebc00
 
 
 
 
ab56328
d1ebc00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab56328
d1ebc00
 
 
 
 
 
 
 
 
ab56328
 
d1ebc00
 
ab56328
d1ebc00
 
ab56328
d1ebc00
 
 
 
ab56328
d1ebc00
 
ab56328
 
d1ebc00
 
ab56328
 
d1ebc00
 
 
 
ab56328
d1ebc00
 
ab56328
d1ebc00
 
ab56328
 
d1ebc00
 
 
 
 
 
 
ab56328
 
 
d1ebc00
 
 
ab56328
d1ebc00
 
ab56328
 
d1ebc00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Dict
import streamlit as st

def get_document_context(file_path: str, query: str) -> str:
    """
    Lightweight document retrieval using TF-IDF instead of FAISS.

    Loads a .pdf or .txt file, splits it into chunks, ranks the chunks
    against `query` with TF-IDF cosine similarity, and returns the most
    relevant chunks joined by blank lines. All failures are reported as
    human-readable "Error: ..." strings rather than raised, so callers
    can surface the result directly in the UI.

    Args:
        file_path: Path to a .pdf or .txt document.
        query: Search text used to rank the chunks.

    Returns:
        Up to three relevant chunks of the document joined by "\\n\\n",
        or an "Error: ..." / "No relevant context found..." message.
    """
    print("--- Using TF-IDF for document retrieval ---")
    
    # Load document and split it into retrievable chunks
    if file_path.endswith(".pdf"):
        try:
            from pypdf import PdfReader
            reader = PdfReader(file_path)
            documents = []
            for page in reader.pages:
                text = page.extract_text()
                # extract_text() may return None for image-only pages;
                # guard before calling .strip() (a bare `text.strip()`
                # would raise AttributeError here).
                if text and text.strip():
                    documents.append(text)
        except Exception as e:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; log the cause before returning the message.
            print(f"PDF read failed: {e}")
            return "Error: Could not read PDF file."
            
    elif file_path.endswith(".txt"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                # ~1000-char chunks with 200-char overlap (step 800) so
                # text spanning a boundary appears in both neighbors.
                documents = [content[i:i+1000] for i in range(0, len(content), 800)]
        except Exception as e:
            print(f"Text read failed: {e}")
            return "Error: Could not read text file."
    else:
        return "Error: Unsupported file format. Please upload a .pdf or .txt file."

    if not documents:
        return "Error: Document is empty or could not be read."

    try:
        # TF-IDF vectors stand in for neural embeddings
        vectorizer = TfidfVectorizer(
            stop_words='english', 
            max_features=5000,
            ngram_range=(1, 2)  # Include bigrams for better context
        )
        
        # Transform documents and query into the same vector space
        doc_vectors = vectorizer.fit_transform(documents)
        query_vector = vectorizer.transform([query])
        
        # Cosine similarity of the query against every chunk
        similarities = cosine_similarity(query_vector, doc_vectors).flatten()
        
        # Top 3 chunk indices, most similar first
        top_indices = similarities.argsort()[-3:][::-1]
        
        # Keep only chunks with non-negligible relevance
        context_chunks = [
            documents[idx]
            for idx in top_indices
            if similarities[idx] > 0.1
        ]
        
        context = "\n\n".join(context_chunks)
        return context if context else "No relevant context found in the document."
        
    except Exception as e:
        print(f"An error occurred during document processing: {e}")
        return "Error: Failed to process the provided document."

def run_rag_agent(user_prompt: str, file_path: str) -> str:
    """
    The main agentic function - keep the same interface as before.

    Asks the LLM to distill the user's story idea into one focused search
    query (falling back to the raw prompt if generation fails), then pulls
    matching context from the document via TF-IDF retrieval.

    Args:
        user_prompt: The user's story idea.
        file_path: Path to the uploaded .pdf or .txt document.

    Returns:
        Retrieved document context, or an "Error: ..." message.
    """
    print("--- RAG Agent Activated (Lightweight TF-IDF Mode) ---")
    
    # Imported lazily so the module loads even if llm_setup has side effects
    from llm_setup import llm
    
    # Guard clause: without an LLM we cannot build a search query at all
    if not llm:
        return "Error: LLM not available for query generation."
    
    search_prompt = f"""You are a research assistant. Based on the user's story idea, what is the single most
important keyword or question to search for within their provided document to find relevant context?

User's Story Idea: '{user_prompt}'

Optimized Search Query for Document:"""

    try:
        search_query = llm.invoke(search_prompt).content.strip()
        print(f"Generated Search Query: {search_query}")
    except Exception as err:
        # Degrade gracefully: search with the user's own words instead
        print(f"Query generation failed, using original prompt: {err}")
        search_query = user_prompt
    
    # Lightweight TF-IDF retrieval over the uploaded document
    context = get_document_context(file_path, search_query)
    
    print("--- RAG Agent Finished ---")
    return context