Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,75 +1,90 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import chromadb
|
| 3 |
-
from chromadb.config import Settings
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import json
|
| 6 |
from pathlib import Path
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
collection = chroma_client.create_collection(name=collection_name)
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
metadata = [{"title": doc["title"], "source": doc.get("source", ""), "section": doc.get("section", "")}
|
| 31 |
-
for doc in documents]
|
| 32 |
-
embeddings = model.encode(texts).tolist()
|
| 33 |
-
ids = [str(i) for i in range(len(texts))]
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
)
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
# Initialize
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def search_documents(query, top_k=5):
|
| 48 |
if not query.strip():
|
| 49 |
return "Please enter a query"
|
| 50 |
-
|
| 51 |
-
# Generate embedding for query
|
| 52 |
-
query_embedding = model.encode(query).tolist()
|
| 53 |
|
| 54 |
-
|
| 55 |
-
results = collection.query(
|
| 56 |
-
query_embeddings=[query_embedding],
|
| 57 |
-
n_results=top_k,
|
| 58 |
-
include=["documents", "metadatas", "distances"]
|
| 59 |
-
)
|
| 60 |
|
| 61 |
-
# Format
|
| 62 |
output = ""
|
| 63 |
-
for
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
results["distances"][0]
|
| 67 |
-
):
|
| 68 |
-
relevance = round((1 - (distance / 2)) * 100)
|
| 69 |
output += f"\n\nπ {metadata['title']}\n"
|
| 70 |
-
output += f"π {metadata['source']} β’ {metadata['section']} β’ Relevance: {
|
| 71 |
-
output += f"βββββββββββββββββββ\n{
|
| 72 |
-
|
| 73 |
return output
|
| 74 |
|
| 75 |
# Create Gradio interface
|
|
@@ -96,7 +111,11 @@ interface = gr.Interface(
|
|
| 96 |
title="Knowledge Base Search",
|
| 97 |
description="Ask questions about your documents and get relevant answers.",
|
| 98 |
theme="default",
|
| 99 |
-
allow_flagging="never"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
)
|
| 101 |
|
| 102 |
# Launch the app
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
from sentence_transformers import SentenceTransformer
|
| 3 |
import json
|
| 4 |
from pathlib import Path
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import List, Dict
|
| 7 |
|
| 8 |
+
class SimpleRAG:
    """Minimal in-memory retrieval system over a JSON document collection.

    Documents are embedded once with a SentenceTransformer model; queries
    are ranked against the stored embeddings by cosine similarity.
    """

    def __init__(self):
        # Small, fast general-purpose embedding model.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents: List[str] = []
        self.embeddings = []  # replaced by an ndarray by load_documents()
        self.metadata: List[Dict] = []

    def load_documents(self, filepath: str) -> None:
        """Load documents from a JSON file and (re)build their embeddings.

        The file must contain ``{"documents": [{"title", "content", ...}]}``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if a document is missing "content" or "title".
        """
        with open(filepath) as f:
            data = json.load(f)

        # Reset state so repeated loads do not duplicate entries
        # (the previous version appended on every call).
        self.documents = []
        self.metadata = []

        for doc in data["documents"]:
            self.documents.append(doc["content"])
            self.metadata.append({
                "title": doc["title"],
                "source": doc.get("source", "Unknown"),
                "section": doc.get("section", "General"),
            })

        # Create embeddings for all documents in one batch.
        self.embeddings = self.model.encode(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` documents ranked by cosine similarity.

        Each result dict has "content", "metadata", and a float "score".
        Returns an empty list when no documents are loaded (the previous
        version raised on an empty corpus).
        """
        if not self.documents:
            return []

        query_embedding = self.model.encode(query)

        # Cosine similarity between the query and every document embedding.
        similarities = np.dot(self.embeddings, query_embedding) / (
            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
        )

        # Indices of the highest-scoring documents, best first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [
            {
                "content": self.documents[idx],
                "metadata": self.metadata[idx],
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]
|
| 51 |
|
| 52 |
+
# Initialize the RAG system at import time so the Gradio handler can use it.
rag = SimpleRAG()
try:
    rag.load_documents("documents.json")
except (OSError, json.JSONDecodeError, KeyError) as e:
    # Narrowed from a blanket `except Exception`: only recover from the
    # realistic failures (file missing/unreadable, malformed JSON, bad
    # schema); anything else should surface instead of being swallowed.
    print(f"Error loading documents: {e}")
    # Load a sample document if the file doesn't exist, so the demo
    # still launches with something searchable.
    sample_data = {
        "documents": [
            {
                "title": "Sample Document",
                "content": "This is a sample document. Please add your own documents.json file to see real content.",
                "source": "Sample",
                "section": "Test",
            }
        ]
    }
    with open("documents.json", "w") as f:
        json.dump(sample_data, f)
    rag.load_documents("documents.json")
|
| 72 |
|
| 73 |
def search_documents(query, top_k=5):
    """Gradio handler: run a similarity search and format results as text.

    Args:
        query: Free-text user query.
        top_k: Maximum number of results to include.

    Returns:
        A human-readable block of matches, or a short notice string when
        the query is blank or nothing matches.
    """
    if not query.strip():
        return "Please enter a query"

    results = rag.search(query, top_k)
    if not results:
        # Previously an empty corpus silently produced an empty string.
        return "No results found."

    # Format output
    output = ""
    for result in results:
        metadata = result["metadata"]
        # Cosine similarity lies in [-1, 1]; clamp at 0 so the displayed
        # relevance percentage is never negative.
        score_percentage = round(max(result["score"], 0.0) * 100)
        output += f"\n\nπ {metadata['title']}\n"
        output += f"π {metadata['source']} β’ {metadata['section']} β’ Relevance: {score_percentage}%\n"
        output += f"βββββββββββββββββββ\n{result['content']}\n"
    return output
|
| 89 |
|
| 90 |
# Create Gradio interface
|
|
|
|
| 111 |
title="Knowledge Base Search",
|
| 112 |
description="Ask questions about your documents and get relevant answers.",
|
| 113 |
theme="default",
|
| 114 |
+
allow_flagging="never",
|
| 115 |
+
examples=[
|
| 116 |
+
["What is machine learning?"],
|
| 117 |
+
["How does this work?"],
|
| 118 |
+
]
|
| 119 |
)
|
| 120 |
|
| 121 |
# Launch the app
|