abubakaraabi786 committed on
Commit
e6a07a2
·
verified ·
1 Parent(s): f9251fe

Remove the Error

Browse files
Files changed (1) hide show
  1. app.py +83 -50
app.py CHANGED
@@ -1,17 +1,19 @@
1
- import gradio as gr
2
  import os
3
  import tempfile
4
  from typing import List
 
 
5
  import PyPDF2
6
- from sentence_transformers import SentenceTransformer
7
  import numpy as np
8
- from sklearn.metrics.pairwise import cosine_similarity
9
 
 
 
10
  from groq import Groq
11
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
12
- # -----------------------------
13
  # Configuration
14
- # -----------------------------
 
15
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
16
 
17
  if not GROQ_API_KEY:
@@ -22,73 +24,84 @@ if not GROQ_API_KEY:
22
  MODEL_NAME = "llama-3.1-8b-instant"
23
 
24
  client = Groq(api_key=GROQ_API_KEY)
 
25
 
26
-
27
- # -----------------------------
28
  # PDF Processing
29
- # -----------------------------
 
30
  def extract_text_from_pdfs(files: List[tempfile.NamedTemporaryFile]):
31
  documents = []
 
32
  for file in files:
33
  reader = PyPDF2.PdfReader(file)
34
  for page_num, page in enumerate(reader.pages):
35
  text = page.extract_text()
36
- if text:
37
  documents.append({
38
  "text": text,
39
  "source": f"{os.path.basename(file.name)} - page {page_num + 1}"
40
  })
 
41
  return documents
42
 
43
- # -----------------------------
44
  # Chunking
45
- # -----------------------------
 
46
  def chunk_text(documents, chunk_size=500, overlap=50):
47
  chunks = []
 
48
  for doc in documents:
49
- text = doc["text"]
50
- words = text.split()
51
  start = 0
 
52
  while start < len(words):
53
  chunk_words = words[start:start + chunk_size]
54
- chunk_text = " ".join(chunk_words)
 
55
  chunks.append({
56
- "text": chunk_text,
57
  "source": doc["source"]
58
  })
 
59
  start += chunk_size - overlap
 
60
  return chunks
61
 
62
- # -----------------------------
63
  # Embeddings & Retrieval
64
- # -----------------------------
 
65
  def embed_chunks(chunks):
66
  texts = [c["text"] for c in chunks]
67
- embeddings = embedder.encode(texts)
68
- return embeddings
69
 
70
  def retrieve_relevant_chunks(query, chunks, embeddings, top_k=3):
71
  query_embedding = embedder.encode([query])
72
  similarities = cosine_similarity(query_embedding, embeddings)[0]
 
73
  top_indices = np.argsort(similarities)[-top_k:][::-1]
74
 
75
- results = []
76
- for idx in top_indices:
77
- results.append(chunks[idx])
78
- return results
79
 
80
- # -----------------------------
81
  # LLM Call
82
- # -----------------------------
83
- def ask_llm(question, context, history):
84
- messages = history.copy()
85
 
86
- system_prompt = (
87
- "You are a helpful assistant. Answer the question strictly using the provided context. "
88
- "If the answer is not in the context, say so."
89
- )
 
 
 
 
 
 
 
 
90
 
91
- messages.insert(0, {"role": "system", "content": system_prompt})
92
  messages.append({
93
  "role": "user",
94
  "content": f"Context:\n{context}\n\nQuestion:\n{question}"
@@ -102,14 +115,27 @@ def ask_llm(question, context, history):
102
 
103
  return response.choices[0].message.content
104
 
105
- # -----------------------------
106
- # Main Chat Logic
107
- # -----------------------------
 
108
  def chat(files, question, chat_history):
109
  if not files:
110
- return chat_history, "Please upload PDF files first."
 
 
 
 
111
 
112
  documents = extract_text_from_pdfs(files)
 
 
 
 
 
 
 
 
113
  chunks = chunk_text(documents)
114
  embeddings = embed_chunks(chunks)
115
 
@@ -117,6 +143,7 @@ def chat(files, question, chat_history):
117
 
118
  context = ""
119
  sources = []
 
120
  for c in relevant_chunks:
121
  context += c["text"] + "\n\n"
122
  sources.append(c["source"])
@@ -124,19 +151,26 @@ def chat(files, question, chat_history):
124
  answer = ask_llm(question, context, chat_history)
125
 
126
  answer_with_sources = (
127
- f"{answer}\n\n"
128
- f"Sources:\n" + "\n".join(set(sources))
129
  )
130
 
131
- chat_history.append({"role": "user", "content": question})
132
- chat_history.append({"role": "assistant", "content": answer_with_sources})
 
 
133
 
134
- return chat_history, answer_with_sources
 
 
 
 
 
135
 
136
- # -----------------------------
137
  # Gradio UI
138
- # -----------------------------
139
- with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
 
140
  gr.Markdown("## 📚 Enhanced RAG-Based Chatbot (PDF QA)")
141
  gr.Markdown("Upload multiple PDFs and ask questions based on their content.")
142
 
@@ -147,16 +181,15 @@ with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
147
  )
148
 
149
  chatbot = gr.Chatbot(type="messages")
150
- question = gr.Textbox(label="Ask a question")
151
  state = gr.State([])
152
 
153
- submit = gr.Button("Ask")
154
 
155
- submit.click(
156
  fn=chat,
157
  inputs=[pdf_files, question, state],
158
- outputs=[chatbot, chatbot]
159
  )
160
 
161
-
162
  demo.launch()
 
 
1
  import os
2
  import tempfile
3
  from typing import List
4
+
5
+ import gradio as gr
6
  import PyPDF2
 
7
  import numpy as np
 
8
 
9
+ from sentence_transformers import SentenceTransformer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
  from groq import Groq
12
+
13
+ # --------------------------------------------------
14
  # Configuration
15
+ # --------------------------------------------------
16
+
17
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
18
 
19
  if not GROQ_API_KEY:
 
24
  MODEL_NAME = "llama-3.1-8b-instant"
25
 
26
  client = Groq(api_key=GROQ_API_KEY)
27
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
28
 
29
+ # --------------------------------------------------
 
30
  # PDF Processing
31
+ # --------------------------------------------------
32
+
33
def extract_text_from_pdfs(files: List) -> List[dict]:
    """Extract per-page text from the uploaded PDF files.

    Parameters
    ----------
    files : list
        File objects as delivered by the Gradio ``File`` input; each must
        be readable by ``PyPDF2.PdfReader`` and expose a ``name`` attribute.
        (The previous annotation ``List[tempfile.NamedTemporaryFile]`` used
        a factory *function* as a type parameter, which type checkers reject.)

    Returns
    -------
    list[dict]
        One dict per page that yielded non-whitespace text, with keys
        ``"text"`` (the extracted page text) and ``"source"``
        (``"<filename> - page <n>"``, 1-based page number).
    """
    documents = []

    for file in files:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            # Skip pages whose extraction produced nothing but whitespace
            # (e.g. image-only/scanned pages).
            if text and text.strip():
                documents.append({
                    "text": text,
                    "source": f"{os.path.basename(file.name)} - page {page_num + 1}"
                })

    return documents
47
 
48
+ # --------------------------------------------------
49
  # Chunking
50
+ # --------------------------------------------------
51
+
52
def chunk_text(documents, chunk_size=500, overlap=50):
    """Split each document into overlapping word-window chunks.

    Parameters
    ----------
    documents : list[dict]
        Items with ``"text"`` and ``"source"`` keys, as produced by
        ``extract_text_from_pdfs``.
    chunk_size : int
        Maximum number of words per chunk.
    overlap : int
        Number of words shared between consecutive chunks.

    Returns
    -------
    list[dict]
        Chunk dicts with ``"text"`` (joined words) and ``"source"`` keys.
    """
    # Clamp the stride to at least 1: with overlap >= chunk_size the
    # original ``start += chunk_size - overlap`` never advanced, so the
    # while-loop spun forever.
    step = max(1, chunk_size - overlap)

    chunks = []
    for doc in documents:
        words = doc["text"].split()

        for start in range(0, len(words), step):
            chunks.append({
                "text": " ".join(words[start:start + chunk_size]),
                "source": doc["source"],
            })

    return chunks
71
 
72
+ # --------------------------------------------------
73
  # Embeddings & Retrieval
74
+ # --------------------------------------------------
75
+
76
def embed_chunks(chunks):
    """Encode every chunk's text with the shared sentence-transformer.

    Returns whatever ``embedder.encode`` produces for the chunk texts —
    one embedding row per chunk, in the same order as ``chunks``.
    """
    return embedder.encode([chunk["text"] for chunk in chunks])
 
79
 
80
def retrieve_relevant_chunks(query, chunks, embeddings, top_k=3):
    """Rank chunks by cosine similarity to the query and return the best.

    Embeds ``query`` with the shared model, scores it against the
    precomputed chunk ``embeddings``, and returns up to ``top_k`` chunk
    dicts ordered from most to least similar.
    """
    query_vec = embedder.encode([query])
    scores = cosine_similarity(query_vec, embeddings)[0]
    # argsort is ascending: keep the last top_k indices, then reverse so
    # the highest-scoring chunk comes first.
    best_first = np.argsort(scores)[-top_k:][::-1]
    return [chunks[idx] for idx in best_first]
 
 
 
87
 
88
+ # --------------------------------------------------
89
  # LLM Call
90
+ # --------------------------------------------------
 
 
91
 
92
+ def ask_llm(question, context, history):
93
+ messages = [
94
+ {
95
+ "role": "system",
96
+ "content": (
97
+ "You are a helpful assistant. Answer the question strictly using the provided context. "
98
+ "If the answer is not contained in the context, say so clearly."
99
+ )
100
+ }
101
+ ]
102
+
103
+ messages.extend(history)
104
 
 
105
  messages.append({
106
  "role": "user",
107
  "content": f"Context:\n{context}\n\nQuestion:\n{question}"
 
115
 
116
  return response.choices[0].message.content
117
 
118
+ # --------------------------------------------------
119
+ # Chat Logic (Gradio Compatible)
120
+ # --------------------------------------------------
121
+
122
  def chat(files, question, chat_history):
123
  if not files:
124
+ chat_history.append({
125
+ "role": "assistant",
126
+ "content": "❌ Please upload at least one PDF file."
127
+ })
128
+ return chat_history
129
 
130
  documents = extract_text_from_pdfs(files)
131
+
132
+ if not documents:
133
+ chat_history.append({
134
+ "role": "assistant",
135
+ "content": "❌ Could not extract text from the uploaded PDF(s)."
136
+ })
137
+ return chat_history
138
+
139
  chunks = chunk_text(documents)
140
  embeddings = embed_chunks(chunks)
141
 
 
143
 
144
  context = ""
145
  sources = []
146
+
147
  for c in relevant_chunks:
148
  context += c["text"] + "\n\n"
149
  sources.append(c["source"])
 
151
  answer = ask_llm(question, context, chat_history)
152
 
153
  answer_with_sources = (
154
+ f"{answer}\n\n📄 Sources:\n" + "\n".join(set(sources))
 
155
  )
156
 
157
+ chat_history.append({
158
+ "role": "user",
159
+ "content": question
160
+ })
161
 
162
+ chat_history.append({
163
+ "role": "assistant",
164
+ "content": answer_with_sources
165
+ })
166
+
167
+ return chat_history
168
 
169
+ # --------------------------------------------------
170
  # Gradio UI
171
+ # --------------------------------------------------
172
+
173
+ with gr.Blocks(title="Enhanced RAG-Based Chatbot") as demo:
174
  gr.Markdown("## 📚 Enhanced RAG-Based Chatbot (PDF QA)")
175
  gr.Markdown("Upload multiple PDFs and ask questions based on their content.")
176
 
 
181
  )
182
 
183
  chatbot = gr.Chatbot(type="messages")
184
+ question = gr.Textbox(label="Ask a question", placeholder="Type your question here...")
185
  state = gr.State([])
186
 
187
+ ask_button = gr.Button("Ask")
188
 
189
+ ask_button.click(
190
  fn=chat,
191
  inputs=[pdf_files, question, state],
192
+ outputs=chatbot
193
  )
194
 
 
195
  demo.launch()