ChrisSacrumCor committed
Commit f1b5c29 · verified · 1 Parent(s): ddeb653

Create app.py

Files changed (1)
  1. app.py +435 -0
app.py ADDED
@@ -0,0 +1,435 @@
+ import gradio as gr
+ import lancedb
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+ from langgraph.graph import StateGraph, END
+ from langchain.tools import tool
+ from langgraph.prebuilt import create_react_agent
+ import os
+ import shutil
+ from typing import List, Dict, Optional, Annotated
+ from pydantic import BaseModel
+ import PyPDF2
+ from langgraph.graph.message import add_messages
+ import traceback
+
+ # Global setup
+ db = lancedb.connect("./global_vector_db")
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+ llm = ChatOpenAI(model="gpt-3.5-turbo")
+
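+ # NOTE: OpenAIEmbeddings and ChatOpenAI read the OPENAI_API_KEY environment
+ # variable; it must be set before launch or every embed/chat call below fails.
+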
+ def init_documents_table():
+     table_name = "documents_v2"  # Use a new table name to avoid the corrupted schema
+
+     try:
+         documents_table = db.open_table(table_name)
+         print(f"✅ Opened existing table: {table_name}")
+         return documents_table, "embedding"
+
+     except Exception as e:
+         print(f"🔄 Creating new table {table_name}... ({e})")
+
+         # Create a clean table with a proper vector schema
+         sample_doc = [{
+             "text": "sample initialization text",
+             "embedding": embeddings.embed_query("sample"),
+             "source": "init",
+             "doc_id": "init",
+             "chunk_id": 0,
+             "summary": "initialization"
+         }]
+
+         documents_table = db.create_table(table_name, sample_doc)
+         print(f"✅ Created new table: {table_name}")
+         return documents_table, "embedding"
+
+ documents_table, vector_column_name = init_documents_table()
+
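+ # NOTE: LanceDB infers the table schema (vector column included) from the rows
+ # passed to create_table, which is why init_documents_table() seeds a dummy record.
+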
+ def extract_text_with_pypdf2(file_path: str) -> str:
+     """Extract text using PyPDF2 as the primary method."""
+     try:
+         print("📖 Extracting text with PyPDF2...")
+         text = ""
+         with open(file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             print(f"📄 Found {len(pdf_reader.pages)} pages")
+
+             for page_num, page in enumerate(pdf_reader.pages):
+                 try:
+                     page_text = page.extract_text()
+                     if page_text and page_text.strip():
+                         text += f"\n--- Page {page_num + 1} ---\n{page_text.strip()}\n"
+                         print(f"✅ Extracted {len(page_text)} chars from page {page_num + 1}")
+                     else:
+                         print(f"⚠️ No text on page {page_num + 1}")
+                 except Exception as page_error:
+                     print(f"❌ Error extracting page {page_num + 1}: {page_error}")
+                     continue
+
+         return text.strip()
+     except Exception as e:
+         print(f"❌ PyPDF2 extraction failed: {e}")
+         return ""
+
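+ # NOTE: PyPDF2 only recovers an embedded text layer, so scanned/image-only PDFs
+ # come back empty here; the Docling fallback below exists for exactly that case.
+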
+ def extract_text_with_docling(file_path: str) -> str:
+     """Try Docling extraction with better error handling."""
+     try:
+         from docling.document_converter import DocumentConverter
+         converter = DocumentConverter()
+
+         print("📄 Trying Docling conversion...")
+         result = converter.convert(file_path)
+
+         text = ""
+
+         # Debug the result structure
+         print(f"🔍 Docling result type: {type(result)}")
+         print(f"🔍 Docling result attributes: {dir(result)}")
+
+         # Try different ways to access the content
+         if hasattr(result, 'document'):
+             doc = result.document
+             print(f"🔍 Document type: {type(doc)}")
+             print(f"🔍 Document attributes: {dir(doc)}")
+
+             if hasattr(doc, 'pages'):
+                 print(f"🔍 Pages type: {type(doc.pages)}")
+                 print(f"🔍 Number of pages: {len(doc.pages) if hasattr(doc.pages, '__len__') else 'unknown'}")
+
+                 # Check what pages actually contains
+                 if hasattr(doc.pages, '__iter__'):
+                     for i, page in enumerate(doc.pages):
+                         print(f"🔍 Page {i} type: {type(page)}")
+                         if hasattr(page, 'text'):
+                             page_text = page.text
+                             if page_text and len(str(page_text).strip()) > 50:
+                                 text += f"\n--- Page {i + 1} ---\n{page_text}\n"
+                         elif hasattr(page, 'content'):
+                             page_text = str(page.content)
+                             if page_text and len(page_text.strip()) > 50:
+                                 text += f"\n--- Page {i + 1} ---\n{page_text}\n"
+                         else:
+                             print(f"⚠️ Page {i} has no text/content attribute")
+
+             elif hasattr(doc, 'text'):
+                 text = doc.text
+             elif hasattr(doc, 'content'):
+                 text = str(doc.content)
+
+         elif hasattr(result, 'text'):
+             text = result.text
+         elif hasattr(result, 'content'):
+             text = str(result.content)
+
+         return text.strip()
+
+     except Exception as e:
+         print(f"❌ Docling extraction failed: {e}")
+         traceback.print_exc()
+         return ""
+
+ @tool
+ def add_document_to_knowledge_base(file_path: str) -> str:
+     """Process and add a document to the global knowledge base."""
+     try:
+         print(f"🔍 Processing file: {file_path}")
+
+         if not os.path.exists(file_path):
+             return f"❌ File not found: {file_path}"
+
+         doc_id = os.path.basename(file_path)
+
+         # Try multiple extraction methods
+         extracted_text = ""
+
+         # Method 1: Try PyPDF2 first (more reliable)
+         if file_path.lower().endswith('.pdf'):
+             extracted_text = extract_text_with_pypdf2(file_path)
+
+         # Method 2: Try Docling if PyPDF2 failed
+         if not extracted_text:
+             print("🔄 PyPDF2 failed, trying Docling...")
+             extracted_text = extract_text_with_docling(file_path)
+
+         # Method 3: Simple file reading for text files
+         if not extracted_text and file_path.lower().endswith(('.txt', '.md')):
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     extracted_text = f.read()
+             except Exception as e:
+                 print(f"❌ Text file reading failed: {e}")
+
+         if not extracted_text or len(extracted_text.strip()) < 50:
+             return f"❌ Could not extract meaningful text from {doc_id}. The file may be an image-based PDF or corrupted."
+
+         print(f"📝 Successfully extracted {len(extracted_text)} characters")
+
+         # Create summary
+         summary_text = extracted_text[:3000]  # Limit for the API
+         summary_prompt = f"""Summarize this document in 2-3 clear sentences, focusing on the main topics and key points:
+
+ {summary_text}"""
+
+         try:
+             summary_response = llm.invoke(summary_prompt)
+             doc_summary = summary_response.content.strip()
+         except Exception as e:
+             print(f"⚠️ Summary generation failed: {e}")
+             doc_summary = f"Document containing {len(extracted_text)} characters of text"
+
+         print(f"✅ Summary: {doc_summary}")
+
+         # Split into chunks (simple approach)
+         chunk_size = 1000
+         overlap = 100
+         text_chunks = []
+
+         for i in range(0, len(extracted_text), chunk_size - overlap):
+             chunk = extracted_text[i:i + chunk_size].strip()
+             if len(chunk) > 100:  # Skip tiny chunks
+                 text_chunks.append(chunk)
+
+         print(f"🔄 Creating {len(text_chunks)} chunks and embeddings...")
+
+         # Create embeddings and prepare data
+         chunks_data = []
+         for i, chunk_text in enumerate(text_chunks):
+             try:
+                 embedding = embeddings.embed_query(chunk_text)
+
+                 chunk_data = {
+                     "text": chunk_text,
+                     "embedding": embedding,  # Always use 'embedding' as the column name
+                     "source": doc_id,
+                     "doc_id": doc_id,
+                     "chunk_id": i,
+                     "summary": doc_summary
+                 }
+                 chunks_data.append(chunk_data)
+
+             except Exception as e:
+                 print(f"⚠️ Failed to embed chunk {i}: {e}")
+                 continue
+
+         if not chunks_data:
+             return f"❌ Failed to create any valid chunks from {doc_id}"
+
+         # Add to LanceDB
+         print(f"💾 Adding {len(chunks_data)} chunks to LanceDB...")
+         documents_table.add(chunks_data)
+
+         return f"""✅ Successfully processed {doc_id}:
+ - Extracted: {len(extracted_text)} characters
+ - Created: {len(chunks_data)} chunks
+ - Added to knowledge base
+ - Summary: {doc_summary}"""
+
+     except Exception as e:
+         print(f"❌ Error processing document: {str(e)}")
+         traceback.print_exc()
+         return f"❌ Error processing document: {str(e)}"
+
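+ # NOTE: with chunk_size=1000 and overlap=100, the chunking loop above advances in
+ # 900-character strides, so consecutive chunks share 100 characters of context.
+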
+ @tool
+ def search_text_directly(query: str, limit: int = 3) -> str:
+     """Search document text directly using keyword matching (fallback method)."""
+     try:
+         print(f"🔍 Direct text search for: {query}")
+
+         # Get all documents and search by text matching
+         all_docs = documents_table.to_pandas()
+
+         if all_docs.empty:
+             return "No documents in knowledge base."
+
+         # Simple keyword matching
+         query_lower = query.lower()
+         matches = []
+
+         for _, doc in all_docs.iterrows():
+             text_lower = doc['text'].lower()
+             if any(word in text_lower for word in query_lower.split()):
+                 matches.append(doc)
+
+         if not matches:
+             return f"No text matches found for '{query}'"
+
+         # Sort by relevance (count of matching words)
+         def relevance_score(text):
+             return sum(1 for word in query_lower.split() if word in text.lower())
+
+         matches.sort(key=lambda x: relevance_score(x['text']), reverse=True)
+         matches = matches[:limit]
+
+         print(f"📚 Found {len(matches)} text matches")
+
+         # Format results
+         formatted_results = []
+         for i, doc in enumerate(matches, 1):
+             text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
+             formatted_results.append(
+                 f"📄 **Match {i}** (from {doc['source']}):\n{text_preview}\n"
+             )
+
+         return ("\n" + "=" * 60 + "\n").join(formatted_results)
+
+     except Exception as e:
+         print(f"❌ Error in direct text search: {str(e)}")
+         return f"❌ Error in direct text search: {str(e)}"
+
+ @tool
+ def search_knowledge_base(query: str, limit: int = 3) -> str:
+     """Search the global knowledge base for relevant information."""
+     try:
+         print(f"🔍 Searching knowledge base for: {query}")
+
+         # Create the query embedding
+         query_vector = embeddings.embed_query(query)
+
+         # Simple search without specifying the vector column (let LanceDB auto-detect it)
+         results = documents_table.search(query_vector).limit(limit).to_list()
+
+         if not results:
+             return "No relevant documents found in knowledge base."
+
+         print(f"📚 Found {len(results)} relevant chunks")
+
+         # Format results nicely
+         formatted_results = []
+         for i, doc in enumerate(results, 1):
+             text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
+             formatted_results.append(
+                 f"📄 **Result {i}** (from {doc['source']}):\n{text_preview}\n"
+             )
+
+         return ("\n" + "=" * 60 + "\n").join(formatted_results)
+
+     except Exception as e:
+         print(f"❌ Error searching knowledge base: {str(e)}")
+         traceback.print_exc()
+         return f"❌ Error searching knowledge base: {str(e)}"
+
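+ # NOTE: documents_table.search(query_vector) runs a vector nearest-neighbour
+ # query; because the table has a single vector column ("embedding"), LanceDB can
+ # resolve it without the column being named explicitly.
+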
+ # State definition using modern LangGraph patterns
+ class AgentState(BaseModel):
+     messages: Annotated[list, add_messages]
+     user_input: str = ""
+     uploaded_file_path: Optional[str] = None
+
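+ # NOTE: add_messages is a LangGraph reducer: message updates returned by nodes
+ # are appended to this list rather than replacing it.
+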
+ def agent_node(state: AgentState):
+     """Agent node built on create_react_agent."""
+
+     tools = [search_knowledge_base, add_document_to_knowledge_base, search_text_directly]
+
+     # Create the agent
+     agent = create_react_agent(llm, tools)
+
+     # Prepare the message
+     user_message = state.user_input
+     if state.uploaded_file_path:
+         user_message = f"I uploaded a file: {state.uploaded_file_path}. Please process it into the knowledge base and tell me about its contents. Then answer: {user_message}"
+
+     # Invoke the agent
+     try:
+         result = agent.invoke({
+             "messages": [{"role": "user", "content": user_message}]
+         })
+
+         return {
+             "messages": result["messages"],
+             "user_input": state.user_input,
+             "uploaded_file_path": state.uploaded_file_path
+         }
+
+     except Exception as e:
+         error_msg = f"❌ Agent error: {str(e)}"
+         print(error_msg)
+         traceback.print_exc()
+         return {
+             "messages": state.messages + [{"role": "assistant", "content": error_msg}],
+             "user_input": state.user_input,
+             "uploaded_file_path": state.uploaded_file_path
+         }
+
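+ # NOTE: create_react_agent returns a prebuilt graph that loops between the LLM
+ # and its tools until the model stops requesting tool calls. Rebuilding it on
+ # every turn (as above) works, but caching it at module level would avoid the
+ # small per-request overhead.
+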
+ # Build workflow
+ workflow = StateGraph(AgentState)
+ workflow.add_node("agent", agent_node)
+ workflow.set_entry_point("agent")
+ workflow.add_edge("agent", END)
+ app = workflow.compile()
+
+ def process_chat(message, history, uploaded_file):
+     """Process chat with file-upload handling."""
+
+     print(f"📥 Message: {message}")
+     print(f"📁 File: {uploaded_file}")
+
+     # Handle file upload
+     permanent_file_path = None
+     if uploaded_file is not None:
+         upload_dir = "./uploaded_docs"
+         os.makedirs(upload_dir, exist_ok=True)
+
+         filename = os.path.basename(uploaded_file.name)
+         permanent_file_path = os.path.join(upload_dir, filename)
+
+         try:
+             shutil.copy2(uploaded_file.name, permanent_file_path)
+             print(f"📋 Copied to: {permanent_file_path}")
+         except Exception as e:
+             print(f"❌ File copy failed: {e}")
+             permanent_file_path = None
+
+     # Create state and run the agent
+     state = AgentState(
+         messages=[],
+         user_input=message,
+         uploaded_file_path=permanent_file_path
+     )
+
+     try:
+         result = app.invoke(state)
+         # Get the last assistant message
+         assistant_messages = [msg for msg in result['messages']
+                               if (hasattr(msg, 'type') and msg.type == 'ai') or
+                               (isinstance(msg, dict) and msg.get('role') == 'assistant')]
+
+         if assistant_messages:
+             response = assistant_messages[-1].content if hasattr(assistant_messages[-1], 'content') else str(assistant_messages[-1])
+         else:
+             # Fallback: get the last message regardless of type
+             last_msg = result['messages'][-1] if result['messages'] else None
+             if last_msg:
+                 response = last_msg.content if hasattr(last_msg, 'content') else str(last_msg)
+             else:
+                 response = "No response generated"
+
+     except Exception as e:
+         response = f"❌ Error: {str(e)}"
+         print(f"❌ App error: {e}")
+         traceback.print_exc()
+
+     history.append([message, response])
+     return history, ""
+
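+ # NOTE: appending [user, assistant] pairs matches Gradio's tuple-style Chatbot
+ # history; recent Gradio releases prefer gr.Chatbot(type="messages") with
+ # role/content dicts instead.
+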
+ # Gradio interface
+ with gr.Blocks(title="Knowledge Base Agent") as demo:
+     gr.Markdown("# 📚 Knowledge Base Agent")
+     gr.Markdown("Upload PDF documents and ask questions! Uses PyPDF2 as the primary extraction method.")
+
+     chatbot = gr.Chatbot(height=500)
+
+     with gr.Row():
+         msg = gr.Textbox(
+             label="Message",
+             placeholder="Upload a document or ask a question...",
+             scale=4
+         )
+         upload = gr.File(
+             label="Upload",
+             file_types=[".pdf", ".docx", ".txt", ".md"],
+             scale=1
+         )
+
+     msg.submit(
+         process_chat,
+         inputs=[msg, chatbot, upload],
+         outputs=[chatbot, msg]
+     )
+
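+ # NOTE: .docx uploads are accepted by the file widget but only the Docling path
+ # can parse them; PyPDF2 covers .pdf and the plain-text reader covers .txt/.md.
+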
+ if __name__ == "__main__":
+     demo.launch(debug=True)