abakerdp commited on
Commit
6a6e1c8
·
verified ·
1 Parent(s): 873c87e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -56
app.py CHANGED
@@ -1,75 +1,90 @@
1
  import gradio as gr
2
- import chromadb
3
- from chromadb.config import Settings
4
  from sentence_transformers import SentenceTransformer
5
  import json
6
  from pathlib import Path
 
 
7
 
8
- # Initialize ChromaDB and model
9
- chroma_client = chromadb.Client(Settings(
10
- chroma_db_impl="duckdb+parquet",
11
- persist_directory="db"
12
- ))
13
- model = SentenceTransformer('all-MiniLM-L6-v2')
14
-
15
- def initialize_database():
16
- # Load documents from JSON file
17
- docs_path = Path("documents.json")
18
- with open(docs_path) as f:
19
- documents = json.load(f)
20
-
21
- # Create or get collection
22
- collection_name = "knowledge_base"
23
- try:
24
- collection = chroma_client.get_collection(collection_name)
25
- except:
26
- collection = chroma_client.create_collection(name=collection_name)
27
 
28
- # Add documents to collection
29
- texts = [doc["content"] for doc in documents]
30
- metadata = [{"title": doc["title"], "source": doc.get("source", ""), "section": doc.get("section", "")}
31
- for doc in documents]
32
- embeddings = model.encode(texts).tolist()
33
- ids = [str(i) for i in range(len(texts))]
34
 
35
- collection.add(
36
- documents=texts,
37
- embeddings=embeddings,
38
- metadatas=metadata,
39
- ids=ids
 
 
40
  )
41
-
42
- return collection
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Initialize database
45
- collection = initialize_database()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  def search_documents(query, top_k=5):
48
  if not query.strip():
49
  return "Please enter a query"
50
-
51
- # Generate embedding for query
52
- query_embedding = model.encode(query).tolist()
53
 
54
- # Query the collection
55
- results = collection.query(
56
- query_embeddings=[query_embedding],
57
- n_results=top_k,
58
- include=["documents", "metadatas", "distances"]
59
- )
60
 
61
- # Format results
62
  output = ""
63
- for doc, metadata, distance in zip(
64
- results["documents"][0],
65
- results["metadatas"][0],
66
- results["distances"][0]
67
- ):
68
- relevance = round((1 - (distance / 2)) * 100)
69
  output += f"\n\nπŸ“š {metadata['title']}\n"
70
- output += f"πŸ“ {metadata['source']} β€’ {metadata['section']} β€’ Relevance: {relevance}%\n"
71
- output += f"───────────────────\n{doc}\n"
72
-
73
  return output
74
 
75
  # Create Gradio interface
@@ -96,7 +111,11 @@ interface = gr.Interface(
96
  title="Knowledge Base Search",
97
  description="Ask questions about your documents and get relevant answers.",
98
  theme="default",
99
- allow_flagging="never"
 
 
 
 
100
  )
101
 
102
  # Launch the app
 
1
  import gradio as gr
 
 
2
  from sentence_transformers import SentenceTransformer
3
  import json
4
  from pathlib import Path
5
+ import numpy as np
6
+ from typing import List, Dict
7
 
8
class SimpleRAG:
    """In-memory retrieval system: embeds documents with a SentenceTransformer
    model and answers queries by cosine similarity over the embedding matrix."""

    def __init__(self):
        # Model is loaded once and reused for both document and query encoding.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents = []   # raw document texts, index-aligned with metadata
        self.embeddings = []  # becomes an (n_docs, dim) ndarray after load_documents()
        self.metadata = []    # per-document dicts: title / source / section

    def load_documents(self, filepath: str):
        """Load documents from a JSON file of shape ``{"documents": [...]}`` and
        (re)compute the embedding matrix for the whole corpus.

        Raises whatever ``open``/``json.load`` raise, plus ``KeyError`` for a
        document missing ``"title"`` or ``"content"``. Parsing happens into
        local lists first so a malformed file cannot leave the instance
        half-populated — the module-level fallback may retry with another file.
        """
        with open(filepath) as f:
            data = json.load(f)

        new_docs = []
        new_meta = []
        for doc in data["documents"]:
            new_docs.append(doc["content"])
            new_meta.append({
                "title": doc["title"],
                "source": doc.get("source", "Unknown"),
                "section": doc.get("section", "General")
            })

        # Commit only after the whole file parsed cleanly (atomic w.r.t. errors).
        self.documents.extend(new_docs)
        self.metadata.extend(new_meta)
        # Re-encode the full corpus so embeddings stay aligned with documents.
        self.embeddings = self.model.encode(self.documents)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` documents ranked by cosine similarity.

        Each result is ``{"content", "metadata", "score"}``; ``score`` is the
        raw cosine similarity (may be negative for dissimilar documents).
        Returns an empty list when no documents are loaded.
        """
        if not self.documents:
            # Guard: np.dot / norm on an empty embedding matrix would raise.
            return []

        query_embedding = self.model.encode(query)

        # Cosine similarity between the query and every document embedding.
        doc_norms = np.linalg.norm(self.embeddings, axis=1)
        query_norm = np.linalg.norm(query_embedding)
        similarities = np.dot(self.embeddings, query_embedding) / (doc_norms * query_norm)

        # argsort is ascending: take the last top_k, reverse for descending order.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [
            {
                "content": self.documents[idx],
                "metadata": self.metadata[idx],
                "score": float(similarities[idx]),
            }
            for idx in top_indices
        ]
51
 
52
# Initialize the RAG system at import time so the Gradio handler can use it.
rag = SimpleRAG()
try:
    rag.load_documents("documents.json")
except Exception as e:
    print(f"Error loading documents: {e}")
    # Fall back to a bundled sample corpus so the app still starts, but never
    # clobber an existing (possibly just malformed) documents.json that the
    # user may want to inspect and repair — write the sample elsewhere then.
    sample_data = {
        "documents": [
            {
                "title": "Sample Document",
                "content": "This is a sample document. Please add your own documents.json file to see real content.",
                "source": "Sample",
                "section": "Test"
            }
        ]
    }
    if Path("documents.json").exists():
        fallback_path = "sample_documents.json"
    else:
        fallback_path = "documents.json"
    with open(fallback_path, "w") as f:
        json.dump(sample_data, f)
    rag.load_documents(fallback_path)
72
 
73
def search_documents(query, top_k=5):
    """Gradio handler: run a similarity search and format results as text.

    Args:
        query: free-text user query; blank/whitespace input short-circuits.
        top_k: maximum number of results to show.

    Returns a human-readable string; never raises for empty input.
    """
    if not query.strip():
        return "Please enter a query"

    results = rag.search(query, top_k)
    if not results:
        # Previously an empty result list yielded "" — show an explicit message.
        return "No matching documents found."

    # Format output
    output = ""
    for result in results:
        metadata = result["metadata"]
        # Cosine similarity lies in [-1, 1]; clamp at 0 so the displayed
        # relevance is never a negative percentage.
        score_percentage = round(max(result["score"], 0.0) * 100)
        output += f"\n\n📚 {metadata['title']}\n"
        output += f"📍 {metadata['source']} • {metadata['section']} • Relevance: {score_percentage}%\n"
        output += f"───────────────────\n{result['content']}\n"
    return output
89
 
90
  # Create Gradio interface
 
111
  title="Knowledge Base Search",
112
  description="Ask questions about your documents and get relevant answers.",
113
  theme="default",
114
+ allow_flagging="never",
115
+ examples=[
116
+ ["What is machine learning?"],
117
+ ["How does this work?"],
118
+ ]
119
  )
120
 
121
  # Launch the app