Spaces:

abakerdp
/

RAGtimeSearch

Sleeping

App Files Files Community

abakerdp committed on Nov 10, 2024

Commit

0cded56

verified ·

1 Parent(s): 4d84223

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -230

app.py CHANGED Viewed

@@ -1,243 +1,103 @@
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-import numpy as np
-from typing import List, Dict
-import PyPDF2
-import docx
-import os
-from pathlib import Path
-import json
-import fitz  # PyMuPDF for better PDF handling
-import re
-from tqdm import tqdm
-class DocumentProcessor:
-    def __init__(self, docs_dir="documents"):
-        self.docs_dir = docs_dir
-    def extract_text_from_pdf(self, file_path):
-        try:
-            doc = fitz.open(file_path)
-            text_chunks = []
-            for page_num, page in enumerate(doc):
-                # Extract text
-                text = page.get_text()
-                # Get page dimensions for preview coordinates
-                preview = {
-                    "page": page_num + 1,
-                    "total_pages": len(doc),
-                }
-                # Split into chunks (~ 500 chars each)
-                chunks = self.split_into_chunks(text)
-                for chunk in chunks:
-                    text_chunks.append({
-                        "content": chunk,
-                        "metadata": {
-                            "source": os.path.basename(file_path),
-                            "type": "pdf",
-                            "preview": preview
-                        }
-                    })
-            return text_chunks
-        except Exception as e:
-            print(f"Error processing PDF {file_path}: {e}")
-            return []
-    def extract_text_from_docx(self, file_path):
-        try:
-            doc = docx.Document(file_path)
-            text_chunks = []
-            full_text = ""
-            for para in doc.paragraphs:
-                full_text += para.text + "\n"
-            chunks = self.split_into_chunks(full_text)
-            for chunk in chunks:
-                text_chunks.append({
-                    "content": chunk,
-                    "metadata": {
-                        "source": os.path.basename(file_path),
-                        "type": "docx"
-                    }
-                })
-            return text_chunks
-        except Exception as e:
-            print(f"Error processing DOCX {file_path}: {e}")
-            return []
-    def split_into_chunks(self, text, chunk_size=500, overlap=50):
-        chunks = []
-        start = 0
-        text_length = len(text)
-        while start < text_length:
-            end = start + chunk_size
-            # Adjust chunk end to nearest sentence or paragraph break
-            if end < text_length:
-                # Look for sentence endings (.!?) followed by space or newline
-                match = re.search(r'[.!?]\s+', text[end-50:end+50])
-                if match:
-                    end = end - 50 + match.end()
-            chunk = text[start:end].strip()
-            if chunk:  # Only add non-empty chunks
-                chunks.append(chunk)
-            start = end - overlap
-        return chunks
-    def process_all_documents(self):
-        all_chunks = []
-        if not os.path.exists(self.docs_dir):
-            os.makedirs(self.docs_dir)
-            print(f"Created documents directory at {self.docs_dir}")
-            return all_chunks
-        for file_name in tqdm(os.listdir(self.docs_dir)):
-            file_path = os.path.join(self.docs_dir, file_name)
-            if file_name.lower().endswith('.pdf'):
-                chunks = self.extract_text_from_pdf(file_path)
-                all_chunks.extend(chunks)
-            elif file_name.lower().endswith('.docx'):
-                chunks = self.extract_text_from_docx(file_path)
-                all_chunks.extend(chunks)
-        return all_chunks
-class DocumentRAG:
-    def __init__(self):
-        self.model = SentenceTransformer('all-MiniLM-L6-v2')
-        self.documents = []
-        self.embeddings = []
-        self.metadata = []
-        self.processor = DocumentProcessor()
-    def load_documents(self):
-        print("Processing documents...")
-        chunks = self.processor.process_all_documents()
-        self.documents = [chunk["content"] for chunk in chunks]
-        self.metadata = [chunk["metadata"] for chunk in chunks]
-        print("Creating embeddings...")
-        self.embeddings = self.model.encode(self.documents, show_progress_bar=True)
-        print(f"Loaded {len(self.documents)} chunks from documents")
-    def search(self, query: str, top_k: int = 5) -> List[Dict]:
-        query_embedding = self.model.encode(query)
-        similarities = np.dot(self.embeddings, query_embedding) / (
-            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
-        )
-        top_indices = np.argsort(similarities)[-top_k:][::-1]
-        results = []
-        for idx in top_indices:
-            results.append({
-                "content": self.documents[idx],
-                "metadata": self.metadata[idx],
-                "score": float(similarities[idx])
-            })
-        return results
-# Initialize the RAG system
-rag = DocumentRAG()
-rag.load_documents()
-def preview_document(source, page=1):
-    if not source.lower().endswith('.pdf'):
-        return "Preview only available for PDF documents"
-    try:
-        doc = fitz.open(os.path.join("documents", source))
-        if 1 <= page <= len(doc):
-            page_content = doc[page-1]
-            # Convert page to image
-            pix = page_content.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality
-            img_path = f"temp_{source}_{page}.png"
-            pix.save(img_path)
-            return img_path
-        else:
-            return "Invalid page number"
-    except Exception as e:
-        return f"Error previewing document: {e}"
-def search_documents(query, top_k=5, include_preview=True):
-    if not query.strip():
-        return "Please enter a query", None
-    results = rag.search(query, top_k)
-    output = ""
-    preview_path = None
-    for i, result in enumerate(results, 1):
-        metadata = result["metadata"]
-        score_percentage = round(result["score"] * 100)
-        output += f"\n\n📄 Document: {metadata['source']}\n"
-        if metadata['type'] == 'pdf':
-            output += f"📍 Page {metadata['preview']['page']}/{metadata['preview']['total_pages']}"
-        output += f" • Relevance: {score_percentage}%\n"
-        output += f"───────────────────\n{result['content']}\n"
-        # Get preview for the first PDF result if requested
-        if i == 1 and include_preview and metadata['type'] == 'pdf':
-            preview_path = preview_document(metadata['source'], metadata['preview']['page'])
-    return output, preview_path
-# Create Gradio interface
-interface = gr.Interface(
-    fn=search_documents,
-    inputs=[
-        gr.Textbox(
-            lines=2,
-            placeholder="Enter your question here...",
-            label="Question"
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=10,
-            value=5,
-            step=1,
-            label="Number of results"
-        ),
-        gr.Checkbox(
-            label="Show document preview",
-            value=True
-        )
-    ],
-    outputs=[
-        gr.Textbox(
-            label="Search Results",
-            lines=20
-        ),
-        gr.Image(
-            label="Document Preview",
-            type="filepath"
-        )
-    ],
-    title="Document Search",
-    description="Search through PDFs and Word documents. Enter your question to find relevant content.",
-    theme="default",
-    allow_flagging="never",
-    examples=[
-        ["What is the main topic discussed in the documents?"],
-        ["Can you find specific examples of...?"],
-    ]
-)
-# Launch the app
-if __name__ == "__main__":
-    interface.launch()

+import os
 import gradio as gr
+import pinecone
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+from datasets import load_dataset
+# Initialize models and databases
+def init_models():
+    """Load the models, vector index and dataset used by the app.
+    Returns:
+        A 5-tuple ``(embeddings_model, tokenizer, model, index, dataset)``:
+        the sentence-embedding model, the FLAN-T5 tokenizer and seq2seq
+        model, the Pinecone index handle and the Hugging Face dataset.
+    """
+    # Load the embedding model
+    embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
+    # Load the LLM for answering. The auto class for encoder-decoder
+    # checkpoints is AutoModelForSeq2SeqLM; transformers does not export
+    # an "AutoModelForSeq2SeqGeneration", so importing it fails at startup.
+    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
+    # Initialize Pinecone. Credentials are read from the environment so the
+    # real key never has to be committed; the previous literals remain as
+    # fallbacks so behaviour is unchanged when the variables are unset.
+    pinecone.init(
+        api_key=os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key"),
+        environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
+    )
+    index = pinecone.Index("test-index")
+    # Load your dataset from Hugging Face
+    dataset = load_dataset("your-username/your-dataset-name", split="train")
+    return embeddings_model, tokenizer, model, index, dataset
+# Generate response using retrieved context
+def generate_answer(question, context, tokenizer, model):
+    """Answer ``question`` with the seq2seq model, conditioned on ``context``.
+    Args:
+        question: The user's question text.
+        context: Retrieved text placed ahead of the question in the prompt.
+        tokenizer: Callable tokenizer that also provides ``decode``.
+        model: Model object exposing ``generate``.
+    Returns:
+        The decoded text of the first generated sequence, with special
+        tokens stripped.
+    """
+    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
+    # Prompts longer than 1024 tokens are truncated rather than rejected.
+    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
+    # Plain beam search. temperature/top_p were dropped: they only take
+    # effect when do_sample=True, so with beam search alone they changed
+    # nothing and merely produced generation warnings.
+    outputs = model.generate(
+        **inputs,
+        max_length=512,
+        num_beams=4,
+        repetition_penalty=1.2,
+        early_stopping=True,
+    )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+def _retrieve_matches(query, embeddings_model, index, top_k):
+    """Embed ``query`` and return the ``top_k`` matches from the vector index."""
+    query_embedding = embeddings_model.encode(query)
+    results = index.query(
+        vector=query_embedding.tolist(),
+        top_k=top_k,
+        include_metadata=True
+    )
+    return results.matches
+def _match_sources(matches):
+    """Return the distinct ``source`` values of ``matches`` in rank order.
+    Matches without metadata or without a ``source`` entry are skipped
+    instead of raising.
+    """
+    sources = []
+    for match in matches:
+        source = (match.metadata or {}).get('source')
+        if source is not None and source not in sources:
+            sources.append(source)
+    return sources
+def _build_context(sources, dataset):
+    """Join the ``text`` of the first dataset row found for each source.
+    The dataset is scanned once, stopping as soon as every requested
+    source has been found. Sources with no matching row are left out.
+    """
+    wanted = set(sources)
+    if not wanted:
+        return ""
+    text_by_source = {}
+    for item in dataset:
+        source = item['source']
+        if source in wanted and source not in text_by_source:
+            text_by_source[source] = item['text']
+            if len(text_by_source) == len(wanted):
+                break
+    return "\n\n".join(
+        text_by_source[source] for source in sources if source in text_by_source
+    )
+def search_documents(query, embeddings_model, index, dataset, top_k=3):
+    """Retrieve the context text for ``query``.
+    Args:
+        query: The user's question text.
+        embeddings_model: Object whose ``encode`` returns a vector with ``tolist``.
+        index: Vector index exposing ``query``.
+        dataset: Iterable of rows with ``source`` and ``text`` keys.
+        top_k: Number of matches to request from the index.
+    Returns:
+        The matched documents' texts joined by blank lines, in rank order.
+        A source matched several times contributes its text only once, so
+        the prompt is not filled with repeated passages.
+    """
+    matches = _retrieve_matches(query, embeddings_model, index, top_k)
+    return _build_context(_match_sources(matches), dataset)
+# Initialize all models and databases
+embeddings_model, tokenizer, model, index, dataset = init_models()
+def process_query(query):
+    """Answer ``query`` and report which sources were retrieved for it.
+    Returns:
+        A pair ``(answer, sources)`` where ``sources`` holds one
+        ``"Source: ..."`` line per distinct retrieved source.
+    """
+    # Nothing to embed or search for: skip the index and the model.
+    if not query or not query.strip():
+        return "Please enter a question.", ""
+    # Query the index once and reuse the matches for both the context and
+    # the source list, instead of embedding and searching a second time.
+    matches = _retrieve_matches(query, embeddings_model, index, top_k=3)
+    sources = _match_sources(matches)
+    context = _build_context(sources, dataset)
+    # Generate answer
+    answer = generate_answer(query, context, tokenizer, model)
+    # Format sources
+    return answer, "\n".join(f"Source: {source}" for source in sources)
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Document Search and Q&A")
+    # Top row: the question box and the button that triggers the search.
+    with gr.Row():
+        query_input = gr.Textbox(label="Enter your question")
+        search_button = gr.Button("Search")
+    # Bottom row: generated answer next to the list of retrieved sources.
+    with gr.Row():
+        answer_output = gr.Textbox(label="Answer")
+        sources_output = gr.Textbox(label="Sources")
+    # process_query returns (answer, sources), matching the two outputs.
+    search_button.click(
+        process_query,
+        inputs=[query_input],
+        outputs=[answer_output, sources_output]
+    )
+# Start the server only when this file is run as a script, so importing
+# the module does not block on a running web server.
+if __name__ == "__main__":
+    demo.launch()