Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,75 +1,90 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import chromadb
|
| 3 |
-
from chromadb.config import Settings
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import json
|
| 6 |
from pathlib import Path
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
collection = chroma_client.create_collection(name=collection_name)
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
metadata = [{"title": doc["title"], "source": doc.get("source", ""), "section": doc.get("section", "")}
|
| 31 |
-
for doc in documents]
|
| 32 |
-
embeddings = model.encode(texts).tolist()
|
| 33 |
-
ids = [str(i) for i in range(len(texts))]
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
)
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
# Initialize
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def search_documents(query, top_k=5):
|
| 48 |
if not query.strip():
|
| 49 |
return "Please enter a query"
|
| 50 |
-
|
| 51 |
-
# Generate embedding for query
|
| 52 |
-
query_embedding = model.encode(query).tolist()
|
| 53 |
|
| 54 |
-
|
| 55 |
-
results = collection.query(
|
| 56 |
-
query_embeddings=[query_embedding],
|
| 57 |
-
n_results=top_k,
|
| 58 |
-
include=["documents", "metadatas", "distances"]
|
| 59 |
-
)
|
| 60 |
|
| 61 |
-
# Format
|
| 62 |
output = ""
|
| 63 |
-
for
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
results["distances"][0]
|
| 67 |
-
):
|
| 68 |
-
relevance = round((1 - (distance / 2)) * 100)
|
| 69 |
output += f"\n\nπ {metadata['title']}\n"
|
| 70 |
-
output += f"π {metadata['source']} β’ {metadata['section']} β’ Relevance: {
|
| 71 |
-
output += f"βββββββββββββββββββ\n{
|
| 72 |
-
|
| 73 |
return output
|
| 74 |
|
| 75 |
# Create Gradio interface
|
|
@@ -96,7 +111,11 @@ interface = gr.Interface(
|
|
| 96 |
title="Knowledge Base Search",
|
| 97 |
description="Ask questions about your documents and get relevant answers.",
|
| 98 |
theme="default",
|
| 99 |
-
allow_flagging="never"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
)
|
| 101 |
|
| 102 |
# Launch the app
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
from sentence_transformers import SentenceTransformer
|
| 3 |
import json
|
| 4 |
from pathlib import Path
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import List, Dict
|
| 7 |
|
| 8 |
+
class SimpleRAG:
    """Minimal in-memory retrieval system over a JSON document collection.

    Documents are embedded once with a SentenceTransformer model; queries
    are ranked against the stored embeddings by cosine similarity.
    """

    def __init__(self):
        # Small, fast general-purpose embedding model.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents: List[str] = []
        self.embeddings = []  # replaced by an ndarray by load_documents()
        self.metadata: List[Dict] = []

    def load_documents(self, filepath: str) -> None:
        """Load documents from a JSON file and (re)build their embeddings.

        The file must contain ``{"documents": [{"title", "content", ...}]}``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if a document is missing "content" or "title".
        """
        with open(filepath) as f:
            data = json.load(f)

        # Reset state so repeated loads do not duplicate entries
        # (the previous version appended on every call).
        self.documents = []
        self.metadata = []

        for doc in data["documents"]:
            self.documents.append(doc["content"])
            self.metadata.append({
                "title": doc["title"],
                "source": doc.get("source", "Unknown"),
                "section": doc.get("section", "General"),
            })

        # Create embeddings for all documents in one batch.
        self.embeddings = self.model.encode(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` documents ranked by cosine similarity.

        Each result dict has "content", "metadata", and a float "score".
        Returns an empty list when no documents are loaded (the previous
        version raised on an empty corpus).
        """
        if not self.documents:
            return []

        query_embedding = self.model.encode(query)

        # Cosine similarity between the query and every document embedding.
        similarities = np.dot(self.embeddings, query_embedding) / (
            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
        )

        # Indices of the highest-scoring documents, best first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [
            {
                "content": self.documents[idx],
                "metadata": self.metadata[idx],
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]
|
| 51 |
|
| 52 |
+
# Initialize the RAG system at import time so the Gradio handler can use it.
rag = SimpleRAG()
try:
    rag.load_documents("documents.json")
except (OSError, json.JSONDecodeError, KeyError) as e:
    # Narrowed from a blanket `except Exception`: only recover from the
    # realistic failures (file missing/unreadable, malformed JSON, bad
    # schema); anything else should surface instead of being swallowed.
    print(f"Error loading documents: {e}")
    # Load a sample document if the file doesn't exist, so the demo
    # still launches with something searchable.
    sample_data = {
        "documents": [
            {
                "title": "Sample Document",
                "content": "This is a sample document. Please add your own documents.json file to see real content.",
                "source": "Sample",
                "section": "Test",
            }
        ]
    }
    with open("documents.json", "w") as f:
        json.dump(sample_data, f)
    rag.load_documents("documents.json")
|
| 72 |
|
| 73 |
def search_documents(query, top_k=5):
    """Gradio handler: run a similarity search and format results as text.

    Args:
        query: Free-text user query.
        top_k: Maximum number of results to include.

    Returns:
        A human-readable block of matches, or a short notice string when
        the query is blank or nothing matches.
    """
    if not query.strip():
        return "Please enter a query"

    results = rag.search(query, top_k)
    if not results:
        # Previously an empty corpus silently produced an empty string.
        return "No results found."

    # Format output
    output = ""
    for result in results:
        metadata = result["metadata"]
        # Cosine similarity lies in [-1, 1]; clamp at 0 so the displayed
        # relevance percentage is never negative.
        score_percentage = round(max(result["score"], 0.0) * 100)
        output += f"\n\nπ {metadata['title']}\n"
        output += f"π {metadata['source']} β’ {metadata['section']} β’ Relevance: {score_percentage}%\n"
        output += f"βββββββββββββββββββ\n{result['content']}\n"
    return output
|
| 89 |
|
| 90 |
# Create Gradio interface
|
|
|
|
| 111 |
title="Knowledge Base Search",
|
| 112 |
description="Ask questions about your documents and get relevant answers.",
|
| 113 |
theme="default",
|
| 114 |
+
allow_flagging="never",
|
| 115 |
+
examples=[
|
| 116 |
+
["What is machine learning?"],
|
| 117 |
+
["How does this work?"],
|
| 118 |
+
]
|
| 119 |
)
|
| 120 |
|
| 121 |
# Launch the app
|