Spaces:

abakerdp
/

RAGtimeSearch

Sleeping

App Files Files Community

abakerdp committed on Nov 10, 2024

Commit

219895c

verified ·

1 Parent(s): d988be3

Update app.py

Browse files

Files changed (1) hide show

app.py +173 -53

app.py CHANGED Viewed

@@ -1,42 +1,143 @@
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-import json
-from pathlib import Path
 import numpy as np
 from typing import List, Dict
-class SimpleRAG:
     def __init__(self):
         self.model = SentenceTransformer('all-MiniLM-L6-v2')
         self.documents = []
         self.embeddings = []
         self.metadata = []
-    def load_documents(self, filepath: str):
-        with open(filepath) as f:
-            data = json.load(f)
-        for doc in data["documents"]:
-            self.documents.append(doc["content"])
-            self.metadata.append({
-                "title": doc["title"],
-                "source": doc.get("source", "Unknown"),
-                "section": doc.get("section", "General")
-            })
-        # Create embeddings for all documents
-        self.embeddings = self.model.encode(self.documents)
     def search(self, query: str, top_k: int = 5) -> List[Dict]:
-        # Get query embedding
         query_embedding = self.model.encode(query)
-        # Calculate similarities
         similarities = np.dot(self.embeddings, query_embedding) / (
             np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
         )
-        # Get top results
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         results = []
@@ -50,42 +151,51 @@ class SimpleRAG:
         return results
 # Initialize the RAG system
-rag = SimpleRAG()
-try:
-    rag.load_documents("documents.json")
-except Exception as e:
-    print(f"Error loading documents: {e}")
-    # Load a sample document if the file doesn't exist
-    sample_data = {
-        "documents": [
-            {
-                "title": "Sample Document",
-                "content": "This is a sample document. Please add your own documents.json file to see real content.",
-                "source": "Sample",
-                "section": "Test"
-            }
-        ]
-    }
-    with open("documents.json", "w") as f:
-        json.dump(sample_data, f)
-    rag.load_documents("documents.json")
-def search_documents(query, top_k=5):
     if not query.strip():
-        return "Please enter a query"
     results = rag.search(query, top_k)
-    # Format output
     output = ""
-    for result in results:
         metadata = result["metadata"]
         score_percentage = round(result["score"] * 100)
-        output += f"\n\n📚 {metadata['title']}\n"
-        output += f"📍 {metadata['source']} • {metadata['section']} • Relevance: {score_percentage}%\n"
         output += f"───────────────────\n{result['content']}\n"
-    return output
 # Create Gradio interface
 interface = gr.Interface(
@@ -102,19 +212,29 @@ interface = gr.Interface(
             value=5,
             step=1,
             label="Number of results"
         )
     ],
-    outputs=gr.Textbox(
-        label="Search Results",
-        lines=20
-    ),
-    title="Knowledge Base Search",
-    description="Ask questions about your documents and get relevant answers.",
     theme="default",
     allow_flagging="never",
     examples=[
-        ["What is machine learning?"],
-        ["How does this work?"],
     ]
 )

 import gradio as gr
 from sentence_transformers import SentenceTransformer
 import numpy as np
 from typing import List, Dict
+import PyPDF2
+import docx
+import os
+from pathlib import Path
+import json
+import fitz  # PyMuPDF for better PDF handling
+import re
+from tqdm import tqdm
+class DocumentProcessor:
+    def __init__(self, docs_dir="documents"):
+        self.docs_dir = docs_dir
+    def extract_text_from_pdf(self, file_path):
+        try:
+            doc = fitz.open(file_path)
+            text_chunks = []
+            for page_num, page in enumerate(doc):
+                # Extract text
+                text = page.get_text()
+                # Get page dimensions for preview coordinates
+                preview = {
+                    "page": page_num + 1,
+                    "total_pages": len(doc),
+                }
+                # Split into chunks (~ 500 chars each)
+                chunks = self.split_into_chunks(text)
+                for chunk in chunks:
+                    text_chunks.append({
+                        "content": chunk,
+                        "metadata": {
+                            "source": os.path.basename(file_path),
+                            "type": "pdf",
+                            "preview": preview
+                        }
+                    })
+            return text_chunks
+        except Exception as e:
+            print(f"Error processing PDF {file_path}: {e}")
+            return []
+    def extract_text_from_docx(self, file_path):
+        try:
+            doc = docx.Document(file_path)
+            text_chunks = []
+            full_text = ""
+            for para in doc.paragraphs:
+                full_text += para.text + "\n"
+            chunks = self.split_into_chunks(full_text)
+            for chunk in chunks:
+                text_chunks.append({
+                    "content": chunk,
+                    "metadata": {
+                        "source": os.path.basename(file_path),
+                        "type": "docx"
+                    }
+                })
+            return text_chunks
+        except Exception as e:
+            print(f"Error processing DOCX {file_path}: {e}")
+            return []
+    def split_into_chunks(self, text, chunk_size=500, overlap=50):
+        chunks = []
+        start = 0
+        text_length = len(text)
+        while start < text_length:
+            end = start + chunk_size
+            # Adjust chunk end to nearest sentence or paragraph break
+            if end < text_length:
+                # Look for sentence endings (.!?) followed by space or newline
+                match = re.search(r'[.!?]\s+', text[end-50:end+50])
+                if match:
+                    end = end - 50 + match.end()
+            chunk = text[start:end].strip()
+            if chunk:  # Only add non-empty chunks
+                chunks.append(chunk)
+            start = end - overlap
+        return chunks
+    def process_all_documents(self):
+        all_chunks = []
+        if not os.path.exists(self.docs_dir):
+            os.makedirs(self.docs_dir)
+            print(f"Created documents directory at {self.docs_dir}")
+            return all_chunks
+        for file_name in tqdm(os.listdir(self.docs_dir)):
+            file_path = os.path.join(self.docs_dir, file_name)
+            if file_name.lower().endswith('.pdf'):
+                chunks = self.extract_text_from_pdf(file_path)
+                all_chunks.extend(chunks)
+            elif file_name.lower().endswith('.docx'):
+                chunks = self.extract_text_from_docx(file_path)
+                all_chunks.extend(chunks)
+        return all_chunks
+class DocumentRAG:
     def __init__(self):
         self.model = SentenceTransformer('all-MiniLM-L6-v2')
         self.documents = []
         self.embeddings = []
         self.metadata = []
+        self.processor = DocumentProcessor()
+    def load_documents(self):
+        print("Processing documents...")
+        chunks = self.processor.process_all_documents()
+        self.documents = [chunk["content"] for chunk in chunks]
+        self.metadata = [chunk["metadata"] for chunk in chunks]
+        print("Creating embeddings...")
+        self.embeddings = self.model.encode(self.documents, show_progress_bar=True)
+        print(f"Loaded {len(self.documents)} chunks from documents")
     def search(self, query: str, top_k: int = 5) -> List[Dict]:
         query_embedding = self.model.encode(query)
         similarities = np.dot(self.embeddings, query_embedding) / (
             np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
         )
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         results = []
         return results
 # Initialize the RAG system
+rag = DocumentRAG()
+rag.load_documents()
+def preview_document(source, page=1):
+    if not source.lower().endswith('.pdf'):
+        return "Preview only available for PDF documents"
+    try:
+        doc = fitz.open(os.path.join("documents", source))
+        if 1 <= page <= len(doc):
+            page_content = doc[page-1]
+            # Convert page to image
+            pix = page_content.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality
+            img_path = f"temp_{source}_{page}.png"
+            pix.save(img_path)
+            return img_path
+        else:
+            return "Invalid page number"
+    except Exception as e:
+        return f"Error previewing document: {e}"
+def search_documents(query, top_k=5, include_preview=True):
     if not query.strip():
+        return "Please enter a query", None
     results = rag.search(query, top_k)
     output = ""
+    preview_path = None
+    for i, result in enumerate(results, 1):
         metadata = result["metadata"]
         score_percentage = round(result["score"] * 100)
+        output += f"\n\n📄 Document: {metadata['source']}\n"
+        if metadata['type'] == 'pdf':
+            output += f"📍 Page {metadata['preview']['page']}/{metadata['preview']['total_pages']}"
+        output += f" • Relevance: {score_percentage}%\n"
         output += f"───────────────────\n{result['content']}\n"
+        # Get preview for the first PDF result if requested
+        if i == 1 and include_preview and metadata['type'] == 'pdf':
+            preview_path = preview_document(metadata['source'], metadata['preview']['page'])
+    return output, preview_path
 # Create Gradio interface
 interface = gr.Interface(
             value=5,
             step=1,
             label="Number of results"
+        ),
+        gr.Checkbox(
+            label="Show document preview",
+            value=True
+        )
+    ],
+    outputs=[
+        gr.Textbox(
+            label="Search Results",
+            lines=20
+        ),
+        gr.Image(
+            label="Document Preview",
+            type="filepath"
         )
     ],
+    title="Document Search",
+    description="Search through PDFs and Word documents. Enter your question to find relevant content.",
     theme="default",
     allow_flagging="never",
     examples=[
+        ["What is the main topic discussed in the documents?"],
+        ["Can you find specific examples of...?"],
     ]
 )