Chand11 committed on
Commit
db4ca81
·
verified ·
1 Parent(s): a5911d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -26
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import gradio as gr
3
  import faiss
4
  import numpy as np
@@ -29,20 +30,37 @@ doc_names = []
29
  index = None
30
 
31
 
32
- def chunk_text(text, chunk_size=300):
 
 
 
 
33
  chunks = []
34
- for i in range(0, len(text), chunk_size):
35
- chunks.append(text[i:i+chunk_size])
 
 
 
 
 
 
 
 
 
 
36
  return chunks
37
 
38
 
 
 
 
39
  def add_document(file):
40
  global index, documents, doc_chunks, doc_names
41
 
42
  if file is None:
43
- return "No file uploaded."
44
 
45
- # Reset everything (clean workspace)
46
  index = None
47
  documents = []
48
  doc_chunks = []
@@ -56,57 +74,87 @@ def add_document(file):
56
 
57
  chunks = chunk_text(content)
58
 
 
 
 
59
  embeddings = embedder.encode(chunks)
60
  embeddings = np.array(embeddings).astype("float32")
61
 
62
  index = faiss.IndexFlatL2(embeddings.shape[1])
63
  index.add(embeddings)
64
 
 
 
65
  for chunk in chunks:
66
  doc_chunks.append(chunk)
67
- doc_names.append(file.name)
68
 
69
- documents.append(file.name)
70
 
71
- return f"Workspace reset. Uploaded: {file.name}"
72
 
73
 
 
 
 
74
  def list_documents():
75
  if not documents:
76
  return "No documents uploaded."
77
  return "\n".join(documents)
78
 
79
 
 
 
 
80
  def ask_question(question):
 
 
81
  if index is None:
82
- return "No documents available.", "", ""
83
 
84
- if not question.strip():
85
- return "Empty question.", "", ""
86
 
87
  query_embedding = embedder.encode([question])
88
  query_embedding = np.array(query_embedding).astype("float32")
89
 
90
- D, I = index.search(query_embedding, k=3)
 
 
 
91
 
92
- retrieved_chunks = [doc_chunks[i] for i in I[0]]
93
- retrieved_sources = [doc_names[i] for i in I[0]]
 
 
 
 
 
94
 
95
  context = "\n\n".join(retrieved_chunks)
96
 
97
  prompt = f"""
98
- Answer the question using only the context below.
99
- If the answer is not in the context, say you don't know.
 
 
 
 
100
 
101
- Context:
102
- {context}
103
 
104
- Question:
105
- {question}
106
- """
107
 
108
- response = llm.generate_content(prompt)
109
- answer = response.text
 
 
 
 
 
 
110
 
111
  source_display = "\n".join(set(retrieved_sources))
112
  chunk_display = "\n\n---\n\n".join(retrieved_chunks)
@@ -114,17 +162,31 @@ def ask_question(question):
114
  return answer, source_display, chunk_display
115
 
116
 
 
 
 
117
  def system_status():
118
- llm_status = "OK"
119
  db_status = "OK" if index is not None else "No documents loaded"
120
- return f"Backend: OK\nVector DB: {db_status}\nLLM: {llm_status}"
 
 
 
 
 
 
 
121
 
122
 
123
  # -------------------------
124
  # Gradio UI
125
  # -------------------------
126
  with gr.Blocks() as demo:
127
- gr.Markdown("# Mini Private Knowledge Q&A")
 
 
 
 
128
 
129
  with gr.Tab("Upload"):
130
  file_input = gr.File(file_types=[".txt"])
 
1
  import os
2
+ import re
3
  import gradio as gr
4
  import faiss
5
  import numpy as np
 
30
  index = None
31
 
32
 
33
+ # -------------------------
34
+ # Smarter Chunking
35
+ # -------------------------
36
def chunk_text(text, chunk_size=500):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    Sentences are detected as text ending in ``.``, ``!`` or ``?`` followed by
    one or more spaces. They are packed greedily into the current chunk until
    adding the next sentence would reach ``chunk_size``; a single sentence
    longer than ``chunk_size`` is kept whole rather than cut mid-sentence.

    Args:
        text: The raw document text.
        chunk_size: Soft upper bound on the length of a chunk, in characters.

    Returns:
        A list of stripped, non-empty chunk strings. Empty or whitespace-only
        input yields an empty list, so callers can detect an empty document
        with ``if not chunks``.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += " " + sentence
            continue

        # The running chunk is full: flush it and start a new one. It can
        # still be blank here when the very first sentence is oversized, and
        # a blank chunk must never be emitted (it would be embedded as-is).
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence

    # Flush whatever is left, again skipping whitespace-only remainders.
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)

    return chunks
52
 
53
 
54
+ # -------------------------
55
+ # Upload Document
56
+ # -------------------------
57
  def add_document(file):
58
  global index, documents, doc_chunks, doc_names
59
 
60
  if file is None:
61
+ return "Please upload a .txt file."
62
 
63
+ # Reset workspace for clean demo behavior
64
  index = None
65
  documents = []
66
  doc_chunks = []
 
74
 
75
  chunks = chunk_text(content)
76
 
77
+ if len(chunks) == 0:
78
+ return "Uploaded file is empty."
79
+
80
  embeddings = embedder.encode(chunks)
81
  embeddings = np.array(embeddings).astype("float32")
82
 
83
  index = faiss.IndexFlatL2(embeddings.shape[1])
84
  index.add(embeddings)
85
 
86
+ clean_name = os.path.basename(file.name)
87
+
88
  for chunk in chunks:
89
  doc_chunks.append(chunk)
90
+ doc_names.append(clean_name)
91
 
92
+ documents.append(clean_name)
93
 
94
+ return f"Workspace reset. Uploaded: {clean_name}"
95
 
96
 
97
+ # -------------------------
98
+ # List Documents
99
+ # -------------------------
100
def list_documents():
    """Return the names of the uploaded documents, one per line.

    Falls back to a fixed placeholder message when the module-level
    ``documents`` list is empty.
    """
    return "\n".join(documents) if documents else "No documents uploaded."
104
 
105
 
106
+ # -------------------------
107
+ # Ask Question
108
+ # -------------------------
109
def ask_question(question):
    """Answer *question* using only the chunks retrieved from the uploaded document.

    The question is embedded with ``embedder``, the five nearest chunks are
    looked up in the module-level FAISS ``index``, and the LLM is prompted
    with those chunks as its sole context.

    Args:
        question: The user's question text; may be ``None`` or blank.

    Returns:
        A 3-tuple ``(answer, sources, chunks)`` of strings: the answer (or a
        user-facing message when nothing could be answered), the distinct
        source document names one per line, and the retrieved chunks joined
        by ``---`` separators. The last two are empty on every error path.
    """
    if index is None:
        return "Please upload a document first.", "", ""

    if question is None or not question.strip():
        return "Please enter a valid question.", "", ""

    query_embedding = embedder.encode([question])
    query_embedding = np.array(query_embedding).astype("float32")

    _, neighbor_ids = index.search(query_embedding, k=5)

    retrieved_chunks = []
    retrieved_sources = []

    for idx in neighbor_ids[0]:
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors. A bare upper-bound check would let -1 through and Python
        # would silently return the *last* chunk, so check both bounds.
        if 0 <= idx < len(doc_chunks):
            retrieved_chunks.append(doc_chunks[idx])
            retrieved_sources.append(doc_names[idx])

    if not retrieved_chunks:
        return "No relevant content found.", "", ""

    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
You are a strict document-based question answering system.

Use ONLY the provided context.
Do NOT use outside knowledge.
If the answer is not clearly present in the context, say exactly:
"I don't know based on the provided documents."

Context:
{context}

Question:
{question}

Answer clearly and concisely:
"""

    # UI boundary: any failure from the LLM call (or from reading the
    # response text) is reported to the user instead of crashing the app.
    try:
        response = llm.generate_content(prompt)
        answer = response.text.strip()
    except Exception as e:
        return f"LLM Error: {str(e)}", "", ""

    # De-duplicate source names while keeping retrieval order, so the
    # display is stable from one call to the next.
    source_display = "\n".join(dict.fromkeys(retrieved_sources))
    chunk_display = "\n\n---\n\n".join(retrieved_chunks)

    return answer, source_display, chunk_display
163
 
164
 
165
+ # -------------------------
166
+ # System Status
167
+ # -------------------------
168
def system_status():
    """Report the health of the backend, the vector index and the LLM.

    The vector DB is reported as "OK" once the module-level ``index`` has
    been built. The LLM is probed by sending it a short prompt; any
    exception raised by that call is reported as a connection failure.

    Returns:
        A three-line status string.
    """
    backend_status = "OK"

    if index is not None:
        db_status = "OK"
    else:
        db_status = "No documents loaded"

    try:
        llm.generate_content("Say OK")
    except Exception:
        llm_status = "LLM connection failed"
    else:
        llm_status = "OK"

    return f"Backend: {backend_status}\nVector DB: {db_status}\nLLM: {llm_status}"
179
 
180
 
181
  # -------------------------
182
  # Gradio UI
183
  # -------------------------
184
  with gr.Blocks() as demo:
185
+ gr.Markdown("""
186
+ # Mini Private Knowledge Q&A Workspace
187
+
188
+ Upload a text document, ask questions, and see exactly which document and text snippet the answer comes from.
189
+ """)
190
 
191
  with gr.Tab("Upload"):
192
  file_input = gr.File(file_types=[".txt"])