Spaces:

ChrisSacrumCor
/

TorchRAG

Build error

App Files Files Community

ChrisSacrumCor committed on May 27, 2025

Commit

c0f1437

verified ·

1 Parent(s): 069a9d8

Update app.py

Browse files

Files changed (1) hide show

app.py +216 -396

app.py CHANGED Viewed

@@ -1,435 +1,255 @@
-import gradio as gr
-import lancedb
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langgraph.graph import StateGraph, END
-from langchain.tools import tool
-from langgraph.prebuilt import create_react_agent
 import os
-import shutil
-from typing import List, Dict, Optional, Annotated
-from pydantic import BaseModel
 import PyPDF2
-from langgraph.graph.message import add_messages
-import traceback
-# Global setup
-db = lancedb.connect("./global_vector_db")
-embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
-llm = ChatOpenAI(model="gpt-3.5-turbo")
-def init_documents_table():
-    table_name = "documents_v2"  # Use new table name to avoid corrupted schema
-    try:
-        documents_table = db.open_table(table_name)
-        print(f"✅ Opened existing table: {table_name}")
-        return documents_table, "embedding"
-    except Exception as e:
-        print(f"🔄 Creating new table {table_name}... ({e})")
-        # Create a clean table with proper vector schema
-        sample_doc = [{
-            "text": "sample initialization text",
-            "embedding": embeddings.embed_query("sample"),
-            "source": "init",
-            "doc_id": "init",
-            "chunk_id": 0,
-            "summary": "initialization"
-        }]
-        documents_table = db.create_table(table_name, sample_doc)
-        print(f"✅ Created new table: {table_name}")
-        return documents_table, "embedding"
-documents_table, vector_column_name = init_documents_table()
-def extract_text_with_pypdf2(file_path: str) -> str:
-    """Extract text using PyPDF2 as primary method"""
     try:
-        print(f"📖 Extracting text with PyPDF2...")
-        text = ""
-        with open(file_path, 'rb') as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            print(f"📄 Found {len(pdf_reader.pages)} pages")
-            for page_num, page in enumerate(pdf_reader.pages):
-                try:
-                    page_text = page.extract_text()
-                    if page_text and page_text.strip():
-                        text += f"\n--- Page {page_num + 1} ---\n{page_text.strip()}\n"
-                        print(f"✅ Extracted {len(page_text)} chars from page {page_num + 1}")
-                    else:
-                        print(f"⚠️ No text on page {page_num + 1}")
-                except Exception as page_error:
-                    print(f"❌ Error extracting page {page_num + 1}: {page_error}")
-                    continue
-        return text.strip()
     except Exception as e:
-        print(f"❌ PyPDF2 extraction failed: {e}")
-        return ""
-def extract_text_with_docling(file_path: str) -> str:
-    """Try Docling extraction with better error handling"""
     try:
-        from docling import DocumentConverter
-        converter = DocumentConverter()
-        print(f"📄 Trying Docling conversion...")
-        result = converter.convert(file_path)
-        text = ""
-        # Debug the result structure
-        print(f"🔍 Docling result type: {type(result)}")
-        print(f"🔍 Docling result attributes: {dir(result)}")
-        # Try different ways to access the content
-        if hasattr(result, 'document'):
-            doc = result.document
-            print(f"🔍 Document type: {type(doc)}")
-            print(f"🔍 Document attributes: {dir(doc)}")
-            if hasattr(doc, 'pages'):
-                print(f"🔍 Pages type: {type(doc.pages)}")
-                print(f"🔍 Number of pages: {len(doc.pages) if hasattr(doc.pages, '__len__') else 'unknown'}")
-                # Check what pages actually contains
-                if hasattr(doc.pages, '__iter__'):
-                    for i, page in enumerate(doc.pages):
-                        print(f"🔍 Page {i} type: {type(page)}")
-                        if hasattr(page, 'text'):
-                            page_text = page.text
-                            if page_text and len(str(page_text).strip()) > 50:
-                                text += f"\n--- Page {i + 1} ---\n{page_text}\n"
-                        elif hasattr(page, 'content'):
-                            page_text = str(page.content)
-                            if page_text and len(page_text.strip()) > 50:
-                                text += f"\n--- Page {i + 1} ---\n{page_text}\n"
-                        else:
-                            print(f"⚠️ Page {i} has no text/content attribute")
-            elif hasattr(doc, 'text'):
-                text = doc.text
-            elif hasattr(doc, 'content'):
-                text = str(doc.content)
-        elif hasattr(result, 'text'):
-            text = result.text
-        elif hasattr(result, 'content'):
-            text = str(result.content)
-        return text.strip()
     except Exception as e:
-        print(f"❌ Docling extraction failed: {e}")
-        traceback.print_exc()
-        return ""
-@tool
-def add_document_to_knowledge_base(file_path: str) -> str:
-    """Process and add a document to the global knowledge base."""
-    try:
-        print(f"🔍 Processing file: {file_path}")
-        if not os.path.exists(file_path):
-            return f"❌ File not found: {file_path}"
-        doc_id = os.path.basename(file_path)
-        # Try multiple extraction methods
-        extracted_text = ""
-        # Method 1: Try PyPDF2 first (more reliable)
-        if file_path.lower().endswith('.pdf'):
-            extracted_text = extract_text_with_pypdf2(file_path)
-        # Method 2: Try Docling if PyPDF2 failed
-        if not extracted_text:
-            print("🔄 PyPDF2 failed, trying Docling...")
-            extracted_text = extract_text_with_docling(file_path)
-        # Method 3: Simple file reading for text files
-        if not extracted_text and file_path.lower().endswith(('.txt', '.md')):
-            try:
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    extracted_text = f.read()
-            except Exception as e:
-                print(f"❌ Text file reading failed: {e}")
-        if not extracted_text or len(extracted_text.strip()) < 50:
-            return f"❌ Could not extract meaningful text from {doc_id}. File may be image-based PDF or corrupted."
-        print(f"📝 Successfully extracted {len(extracted_text)} characters")
-        # Create summary
-        summary_text = extracted_text[:3000]  # Limit for API
-        summary_prompt = f"""Summarize this document in 2-3 clear sentences, focusing on the main topics and key points:
-{summary_text}"""
-        try:
-            summary_response = llm.invoke(summary_prompt)
-            doc_summary = summary_response.content.strip()
-        except Exception as e:
-            print(f"⚠️ Summary generation failed: {e}")
-            doc_summary = f"Document containing {len(extracted_text)} characters of text"
-        print(f"✅ Summary: {doc_summary}")
-        # Split into chunks (simple approach)
-        chunk_size = 1000
-        overlap = 100
-        text_chunks = []
-        for i in range(0, len(extracted_text), chunk_size - overlap):
-            chunk = extracted_text[i:i + chunk_size].strip()
-            if len(chunk) > 100:  # Skip tiny chunks
-                text_chunks.append(chunk)
-        print(f"🔄 Creating {len(text_chunks)} chunks and embeddings...")
-        # Create embeddings and prepare data
-        chunks_data = []
-        for i, chunk_text in enumerate(text_chunks):
             try:
-                embedding = embeddings.embed_query(chunk_text)
-                chunk_data = {
-                    "text": chunk_text,
-                    "embedding": embedding,  # Always use 'embedding' as column name
-                    "source": doc_id,
-                    "doc_id": doc_id,
-                    "chunk_id": i,
-                    "summary": doc_summary
-                }
-                chunks_data.append(chunk_data)
             except Exception as e:
-                print(f"⚠️ Failed to embed chunk {i}: {e}")
                 continue
-        if not chunks_data:
-            return f"❌ Failed to create any valid chunks from {doc_id}"
-        # Add to LanceDB
-        print(f"💾 Adding {len(chunks_data)} chunks to LanceDB...")
-        documents_table.add(chunks_data)
-        return f"""✅ Successfully processed {doc_id}:
-- Extracted: {len(extracted_text)} characters
-- Created: {len(chunks_data)} chunks
-- Added to knowledge base
-- Summary: {doc_summary}"""
-    except Exception as e:
-        print(f"❌ Error processing document: {str(e)}")
-        traceback.print_exc()
-        return f"❌ Error processing document: {str(e)}"
-@tool
-def search_text_directly(query: str, limit: int = 3) -> str:
-    """Search document text directly using keyword matching (fallback method)."""
-    try:
-        print(f"🔍 Direct text search for: {query}")
-        # Get all documents and search by text matching
-        all_docs = documents_table.to_pandas()
-        if all_docs.empty:
-            return "No documents in knowledge base."
-        # Simple keyword matching
-        query_lower = query.lower()
-        matches = []
-        for _, doc in all_docs.iterrows():
-            text_lower = doc['text'].lower()
-            if any(word in text_lower for word in query_lower.split()):
-                matches.append(doc)
-        if not matches:
-            return f"No text matches found for '{query}'"
-        # Sort by relevance (count of matching words)
-        def relevance_score(text):
-            return sum(1 for word in query_lower.split() if word in text.lower())
-        matches.sort(key=lambda x: relevance_score(x['text']), reverse=True)
-        matches = matches[:limit]
-        print(f"📚 Found {len(matches)} text matches")
-        # Format results
-        formatted_results = []
-        for i, doc in enumerate(matches, 1):
-            text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
-            formatted_results.append(
-                f"📄 **Match {i}** (from {doc['source']}):\n{text_preview}\n"
-            )
-        return "\n" + "="*60 + "\n".join(formatted_results)
     except Exception as e:
-        print(f"❌ Error in direct text search: {str(e)}")
-        return f"❌ Error in direct text search: {str(e)}"
-    """Search the global knowledge base for relevant information."""
-    try:
-        print(f"🔍 Searching knowledge base for: {query}")
-        # Create query embedding
-        query_vector = embeddings.embed_query(query)
-        # Simple search without specifying vector column (let LanceDB auto-detect)
-        results = documents_table.search(query_vector).limit(limit).to_list()
-        if not results:
-            return "No relevant documents found in knowledge base."
-        print(f"📚 Found {len(results)} relevant chunks")
-        # Format results nicely
-        formatted_results = []
-        for i, doc in enumerate(results, 1):
-            text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
-            formatted_results.append(
-                f"📄 **Result {i}** (from {doc['source']}):\n{text_preview}\n"
-            )
-        return "\n" + "="*60 + "\n".join(formatted_results)
-    except Exception as e:
-        print(f"❌ Error searching knowledge base: {str(e)}")
-        traceback.print_exc()
-        return f"❌ Error searching knowledge base: {str(e)}"
-# State definition using modern LangGraph patterns
-class AgentState(BaseModel):
-    messages: Annotated[list, add_messages]
-    user_input: str = ""
-    uploaded_file_path: Optional[str] = None
-def agent_node(state: AgentState):
-    """Agent node using create_react_agent"""
-    tools = [search_knowledge_base, add_document_to_knowledge_base, search_text_directly]
-    # Create the agent
-    agent = create_react_agent(llm, tools)
-    # Prepare the message
-    user_message = state.user_input
-    if state.uploaded_file_path:
-        user_message = f"I uploaded a file: {state.uploaded_file_path}. Please process it into the knowledge base and tell me about its contents. Then answer: {user_message}"
-    # Invoke the agent
-    try:
-        result = agent.invoke({
-            "messages": [{"role": "user", "content": user_message}]
-        })
-        return {
-            "messages": result["messages"],
-            "user_input": state.user_input,
-            "uploaded_file_path": state.uploaded_file_path
-        }
-    except Exception as e:
-        error_msg = f"❌ Agent error: {str(e)}"
-        print(error_msg)
-        traceback.print_exc()
-        return {
-            "messages": state.messages + [{"role": "assistant", "content": error_msg}],
-            "user_input": state.user_input,
-            "uploaded_file_path": state.uploaded_file_path
-        }
-# Build workflow
-workflow = StateGraph(AgentState)
-workflow.add_node("agent", agent_node)
-workflow.set_entry_point("agent")
-workflow.add_edge("agent", END)
-app = workflow.compile()
-def process_chat(message, history, uploaded_file):
-    """Process chat with file upload handling"""
-    print(f"📥 Message: {message}")
-    print(f"📁 File: {uploaded_file}")
-    # Handle file upload
-    permanent_file_path = None
-    if uploaded_file is not None:
-        upload_dir = "./uploaded_docs"
-        os.makedirs(upload_dir, exist_ok=True)
-        filename = os.path.basename(uploaded_file.name)
-        permanent_file_path = os.path.join(upload_dir, filename)
-        try:
-            shutil.copy2(uploaded_file.name, permanent_file_path)
-            print(f"📋 Copied to: {permanent_file_path}")
-        except Exception as e:
-            print(f"❌ File copy failed: {e}")
-            permanent_file_path = None
-    # Create state and run agent
-    state = AgentState(
-        messages=[],
-        user_input=message,
-        uploaded_file_path=permanent_file_path
-    )
-    try:
-        result = app.invoke(state)
-        # Get the last assistant message
-        assistant_messages = [msg for msg in result['messages']
-                             if hasattr(msg, 'type') and msg.type == 'ai' or
-                                (isinstance(msg, dict) and msg.get('role') == 'assistant')]
-        if assistant_messages:
-            response = assistant_messages[-1].content if hasattr(assistant_messages[-1], 'content') else str(assistant_messages[-1])
-        else:
-            # Fallback: get the last message regardless of type
-            last_msg = result['messages'][-1] if result['messages'] else None
-            if last_msg:
-                response = last_msg.content if hasattr(last_msg, 'content') else str(last_msg)
-            else:
-                response = "No response generated"
-    except Exception as e:
-        response = f"❌ Error: {str(e)}"
-        print(f"❌ App error: {e}")
-        traceback.print_exc()
-    history.append([message, response])
-    return history, ""
-# Gradio interface
-with gr.Blocks(title="Knowledge Base Agent") as demo:
-    gr.Markdown("# 📚 Knowledge Base Agent")
-    gr.Markdown("Upload PDF documents and ask questions! Uses PyPDF2 as primary extraction method.")
-    chatbot = gr.Chatbot(height=500)
     with gr.Row():
-        msg = gr.Textbox(
-            label="Message",
-            placeholder="Upload a document or ask a question...",
-            scale=4
-        )
-        upload = gr.File(
-            label="Upload",
-            file_types=[".pdf", ".docx", ".txt", ".md"],
-            scale=1
         )
     msg.submit(
-        process_chat,
-        inputs=[msg, chatbot, upload],
-        outputs=[chatbot, msg]
     )
 if __name__ == "__main__":
-    demo.launch(debug=True)

 import os
+import pathlib
+from dotenv import load_dotenv
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_chroma import Chroma
+from langchain.schema import Document
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chains.base import Chain
+from langchain.memory import ConversationBufferMemory
+import gradio as gr
+from langchain_core.retrievers import BaseRetriever
+import re
 import PyPDF2
+# Runtime configuration: read a .env file into the process environment and
+# fail fast when the OpenAI credential is missing.
+load_dotenv()
+api_key = os.getenv("OPENAI_API_KEY")
+if not api_key:
+    raise ValueError("OPENAI_API_KEY environment variable is not set")
+# Chunking parameters shared by the text splitters in this module
+# (chunk length and the overlap between neighbouring chunks).
+CHUNK_SIZE = 1000
+CHUNK_OVERLAP = 200
+# Document Loader
+class DocumentLoaderException(Exception):
+    """Raised when uploaded documents cannot be loaded."""
+class DocumentLoader:
+    """Registry of the loader class to use for each supported file extension."""
+    # Keys are extensions without the leading dot.
+    supported_files = dict(pdf=PyPDFLoader, txt=TextLoader)
+def load_documents(file_path: str) -> list[Document]:
+    """Load one file into a list of documents.
+
+    The loader class is looked up in ``DocumentLoader.supported_files`` by the
+    file's lower-cased extension (leading dot removed).
+
+    Args:
+        file_path: Path of the file to load.
+
+    Returns:
+        Whatever the selected loader's ``load()`` returns.
+
+    Raises:
+        DocumentLoaderException: If no loader is registered for the extension.
+    """
+    extension = pathlib.Path(file_path).suffix.lower().lstrip('.')
+    loader_cls = DocumentLoader.supported_files.get(extension)
+    if loader_cls:
+        return loader_cls(file_path).load()
+    raise DocumentLoaderException(f"Unsupported file type: {extension}. Please provide a .txt or .pdf file")
+# Embeddings and vector storage
+def configure_retriever(docs: list[Document]) -> BaseRetriever:
+    """Split, embed and index *docs*, then expose the index as a retriever.
+
+    Documents are split with ``CHUNK_SIZE`` / ``CHUNK_OVERLAP``, embedded with
+    ``OpenAIEmbeddings`` and stored in a Chroma collection persisted under
+    ``chroma_db``.
+
+    Args:
+        docs: Documents to index.
+
+    Returns:
+        A retriever using MMR search with ``k=6`` and ``fetch_k=20``.
+    """
+    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+    store = Chroma.from_documents(
+        documents=splitter.split_documents(docs),
+        embedding=OpenAIEmbeddings(),
+        persist_directory="chroma_db"
+    )
+    return store.as_retriever(search_type="mmr", search_kwargs={"k": 6, "fetch_k": 20})
+# Chatbot
+def configure_chatbot(retriever: BaseRetriever) -> Chain:
+    """Build the conversational question-answering chain over *retriever*.
+
+    Args:
+        retriever: Retriever used to fetch context documents for each question.
+
+    Returns:
+        A ``ConversationalRetrievalChain`` that keeps its own chat history in a
+        ``ConversationBufferMemory`` under the ``"chat_history"`` key.
+    """
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+    model = ChatOpenAI(
+        model="gpt-4o-mini",
+        # Answers must stay grounded in the retrieved passages, so decode
+        # deterministically. The API accepts 0-2 and values near the top of
+        # that range produce close-to-random text.
+        temperature=0,
+        streaming=True,
+        max_tokens=15000
+    )
+    return ConversationalRetrievalChain.from_llm(
+        llm=model,
+        retriever=retriever,
+        memory=memory,
+        verbose=True
+    )
+# Gradio app functions
+def process_files(files):
+    """Turn uploaded files into a ready-to-use chatbot chain.
+
+    Args:
+        files: Uploaded file objects exposing a ``name`` path attribute, or a
+            falsy value when nothing was uploaded.
+
+    Returns:
+        ``None`` when *files* is empty, otherwise the chain produced by
+        ``configure_chatbot`` over the loaded documents.
+
+    Raises:
+        DocumentLoaderException: If none of the files yielded a document.
+    """
+    if not files:
+        return None
+    # Uploads whose path no longer exists are skipped rather than failing.
+    loaded = [
+        doc
+        for upload in files
+        if os.path.exists(upload.name)
+        for doc in load_documents(upload.name)
+    ]
+    if not loaded:
+        raise DocumentLoaderException("No documents were successfully loaded")
+    return configure_chatbot(configure_retriever(loaded))
+def respond(message, chat_history, qa_chain):
+    """Answer *message* with the retrieval chain and record the exchange.
+
+    Args:
+        message: The user's question text.
+        chat_history: List of ``{"role": ..., "content": ...}`` dicts; it is
+            extended in place with one user turn and one assistant turn.
+        qa_chain: Object whose ``invoke({"question": ...})`` returns a mapping
+            with an ``"answer"`` key, or a falsy value when no documents have
+            been processed yet.
+
+    Returns:
+        A ``("", chat_history)`` tuple.
+    """
+    if not qa_chain:
+        chat_history.append({"role": "user", "content": message})
+        chat_history.append({"role": "assistant", "content": "Please upload documents first."})
+        return "", chat_history
     try:
+        # Resolve the complete answer before touching the history, so a
+        # failure cannot leave the user turn recorded twice.
+        reply = qa_chain.invoke({"question": message})["answer"]
     except Exception as e:
+        reply = f"Error: {str(e)}"
+    chat_history.append({"role": "user", "content": message})
+    chat_history.append({"role": "assistant", "content": reply})
+    return "", chat_history
+def process_files_with_status(files):
+    """Run ``process_files`` and pair its result with a status message.
+
+    Args:
+        files: Uploaded files, or a falsy value when nothing was uploaded.
+
+    Returns:
+        ``(chain, message)`` on success, or ``(None, message)`` when there is
+        nothing to process or processing raised.
+    """
+    if not files:
+        return None, "Please upload at least one document."
     try:
+        chain = process_files(files)
     except Exception as e:
+        return None, f"Error: {str(e)}"
+    return chain, "Documents processed successfully!"
+def clean_text(text):
+    """Normalise extracted text while keeping its line structure.
+
+    Unsupported characters are replaced by spaces and runs of spaces/tabs are
+    collapsed. Blank lines, digit-only lines and lines of three characters or
+    fewer are dropped. A few storefront metadata phrases and any trailing
+    "Read more ..." text on a line are removed.
+
+    Args:
+        text (str): Raw text to clean.
+
+    Returns:
+        str: The cleaned text, stripped of leading/trailing whitespace.
+    """
+    # Replace everything except word characters, whitespace and basic punctuation
+    text = re.sub(r'[^\w\s.,!?-]', ' ', text)
+    # Collapse horizontal whitespace only. Newlines must survive this step:
+    # collapsing them too would turn the text into a single line, making the
+    # line filters below no-ops and letting "Read more.*$" erase the rest of
+    # the text.
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    # Remove empty lines
+    text = re.sub(r'\n\s*\n', '\n', text)
+    # Remove lines that are just numbers or very short
+    text = '\n'.join(line for line in text.split('\n')
+                    if len(line.strip()) > 3 and not line.strip().isdigit())
+    # Remove common metadata patterns
+    text = re.sub(r'File size.*?MB', '', text)
+    text = re.sub(r'Format:.*?Edition', '', text)
+    text = re.sub(r'\d+\.\d+\s+out of \d+ stars', '', text)
+    text = re.sub(r'\d+\s+ratings', '', text)
+    # Remove "Read more" and whatever follows it on the same line
+    text = re.sub(r'Read more.*$', '', text, flags=re.MULTILINE)
+    # Remove the empty lines the removals above may have left behind
+    text = re.sub(r'\n\s*\n', '\n', text)
+    return text.strip()
+def process_pdf(pdf_file):
+    """Extract the text of a PDF, clean it page by page and split it into chunks.
+
+    Pages whose extraction or cleaning raises are reported and skipped.
+
+    Args:
+        pdf_file: Anything ``PyPDF2.PdfReader`` accepts.
+
+    Returns:
+        The list produced by ``split_into_chunks`` for the cleaned text.
+
+    Raises:
+        ValueError: If no page produced any text.
+        Exception: Any other failure is printed and re-raised unchanged.
+    """
+    try:
+        reader = PyPDF2.PdfReader(pdf_file)
+        cleaned_pages = []
+        for page in reader.pages:
             try:
+                raw = page.extract_text()
+                # Clean each page right after extraction; keep only non-empty pages
+                cleaned = clean_text(raw) if raw else ""
+                if cleaned:
+                    cleaned_pages.append(cleaned + "\n")
             except Exception as e:
+                print(f"Warning: Error extracting text from page: {str(e)}")
                 continue
+        text = "".join(cleaned_pages)
+        if not text.strip():
+            raise ValueError("No text could be extracted from the PDF")
+        return split_into_chunks(text)
     except Exception as e:
+        print(f"Error in process_pdf: {str(e)}")
+        raise
+def split_into_chunks(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
+    """
+    Split text into overlapping chunks of specified size.
+
+    Each chunk ends at the last paragraph break ('\\n\\n') inside its window,
+    otherwise at the last sentence end ('. '), otherwise at the hard size
+    limit. The next chunk starts ``chunk_overlap`` characters before the end
+    of the previous one.
+
+    Args:
+        text (str): The text to split
+        chunk_size (int): Maximum size of each chunk
+        chunk_overlap (int): Number of characters to overlap between chunks
+    Returns:
+        list: List of text chunks
+    Raises:
+        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
+            not in ``[0, chunk_size)``; such values cannot make progress.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be positive")
+    if not 0 <= chunk_overlap < chunk_size:
+        raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")
+    chunks = []
+    start = 0
+    text_length = len(text)
+    while start < text_length:
+        end = start + chunk_size
+        if end >= text_length:
+            # Final window: keep the remainder as it is.
+            chunks.append(text[start:])
+            break
+        # Only accept a boundary beyond the overlap region. A cut inside it
+        # would put the next start at or before the current one, and the
+        # loop would never advance.
+        earliest_cut = start + chunk_overlap + 1
+        cut = text.rfind('\n\n', earliest_cut, end)
+        if cut == -1:
+            sentence_break = text.rfind('. ', earliest_cut, end)
+            # Keep the full stop with the sentence it ends.
+            cut = sentence_break + 1 if sentence_break != -1 else end
+        piece = text[start:cut].strip()
+        if piece:
+            chunks.append(piece)
+        # Step back by the overlap; never negative because cut > start + overlap.
+        start = cut - chunk_overlap
+    return chunks
+# Gradio Interface
+with gr.Blocks(title="TorchAIassist") as demo:
+    gr.Markdown("# TorchAIassist")
+    gr.Markdown("A chatbot for your documents")
     with gr.Row():
+        file_output = gr.File(
+            label="Upload your documents",
+            file_count="multiple",
+            file_types=[".pdf", ".txt"]
         )
+        status = gr.Textbox(label="Status", interactive=False)
+    chatbot = gr.Chatbot(height=600, type="messages")
+    msg = gr.Textbox(
+        label="Ask a question about your documents",
+        placeholder="Let me know what you want to know about your documents"
+    )
+    clear = gr.Button("Clear")
+    qa_chain = gr.State(None)
+    # Event handlers
+    file_output.change(
+        fn=process_files_with_status,
+        inputs=[file_output],
+        outputs=[qa_chain, status]
+    )
     msg.submit(
+        fn=respond,
+        inputs=[msg, chatbot, qa_chain],
+        outputs=[msg, chatbot]
     )
+    clear.click(lambda: None, None, chatbot, queue=False)
 if __name__ == "__main__":
+    demo.launch()