yashalhussain committed on
Commit
31fc4f4
·
verified ·
1 Parent(s): 32d2ff9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import PyPDF2
4
+ import requests
5
+
6
+ # ================= CONFIG =================
7
# Groq chat-completions settings. The API key is read from the environment
# once, when the module is imported; an unset variable yields "".
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama-3.1-8b-instant"
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"

# Module-level store shared by the handlers below:
# document base name -> list of text chunks produced by chunk_text().
processed_texts: dict = {}
12
+
13
+ # ================= PDF HANDLING =================
14
def extract_pdf_text(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        One string in which each page's text is followed by a newline.
        A page whose ``extract_text()`` result is falsy contributes only
        the newline.
    """
    pdf = PyPDF2.PdfReader(file)
    per_page = ((page.extract_text() or "") + "\n" for page in pdf.pages)
    return "".join(per_page)
20
+
21
+
22
def chunk_text(text, chunk_size=400, overlap=50):
    """Split *text* into overlapping windows of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the one before it.

    Args:
        text: The text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an
        empty list when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Without this check the window would
            never advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than "
            f"chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would lie
        # entirely inside this one's overlap region and add nothing new.
        if start + chunk_size >= len(words):
            break
    return chunks
31
+
32
+
33
def preview_documents(files):
    """Extract, chunk and summarise the uploaded PDFs.

    Clears ``processed_texts`` and refills it with one entry per file
    (base name -> list of chunks), so each call replaces the previous set of
    documents.

    Args:
        files: The uploaded files, or ``None`` when nothing was uploaded.
            Each item is handed to ``extract_pdf_text`` as-is.

    Returns:
        One row per file: ``[name, word count, first 300 characters,
        "<n> chunks"]``. An empty list when no files were given.
    """
    processed_texts.clear()
    rows = []

    # `files or []` keeps an empty upload from raising TypeError on iteration.
    for f in files or []:
        text = extract_pdf_text(f)
        # NOTE(review): depending on the Gradio version an upload may be a
        # str path or a file wrapper exposing its path as `.name` — confirm.
        # getattr covers both; a plain str has no `.name` and is used directly.
        name = os.path.basename(getattr(f, "name", f))

        chunks = chunk_text(text)
        processed_texts[name] = chunks

        rows.append([
            name,
            len(text.split()),
            text[:300],
            f"{len(chunks)} chunks",
        ])

    return rows
52
+
53
+
54
def process_documents(files):
    """Report how many documents are currently held in ``processed_texts``.

    Args:
        files: Accepted because the UI passes the file component's value;
            it is not read here.

    Returns:
        A status string: a prompt to run the preview step when
        ``processed_texts`` is empty, otherwise the document count.
    """
    if not processed_texts:
        return "❌ Preview documents first."
    # The check mark was mis-encoded ("βœ…") in the original message.
    return f"✅ {len(processed_texts)} document(s) processed."
58
+
59
+
60
+ # ================= GROQ CALL =================
61
def query_groq(prompt):
    """Send *prompt* to the Groq chat-completions endpoint and return the reply.

    Failures are reported as a "❌ ..." string rather than raised, so the
    caller can always display the result directly.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The content of the first choice's message on success, or an error
        string when the API key is missing, the request fails, the status
        is not 200, or the response body has an unexpected shape.
    """
    if not GROQ_API_KEY:
        return "❌ GROQ_API_KEY not set."

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": GROQ_MODEL,
        "messages": [
            {"role": "system", "content": "Answer ONLY from the provided documents."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 400,
    }

    # Timeouts and connection problems are reported as strings, matching how
    # HTTP errors are reported below, instead of crashing the UI handler.
    try:
        r = requests.post(GROQ_URL, headers=headers, json=payload, timeout=30)
    except requests.RequestException as exc:
        return f"❌ Groq request failed: {exc}"

    if r.status_code != 200:
        return f"❌ Groq Error {r.status_code}: {r.text}"

    # A 200 response is not guaranteed to carry the expected JSON structure.
    try:
        return r.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        return f"❌ Unexpected Groq response: {exc!r}"
86
+
87
+
88
+ # ================= RAG =================
89
# Characters trimmed from both ends of a token before comparison.
_TOKEN_TRIM = ".,;:!?\"'()[]{}<>"


def _tokenize(text):
    """Return the set of lower-cased words in *text*, edge punctuation removed."""
    trimmed = (word.strip(_TOKEN_TRIM) for word in text.lower().split())
    return {word for word in trimmed if word}


def retrieve_context(question):
    """Return the stored chunk that shares the most distinct words with *question*.

    Every chunk in ``processed_texts`` is scored by the number of distinct
    words it has in common with the question. Words are compared lower-cased
    and with surrounding punctuation removed, so "revenue?" matches
    "revenue". On a tie the chunk encountered first wins.

    Args:
        question: The user's question.

    Returns:
        The first 1000 characters of the best-scoring chunk, or ``""`` when
        no chunk shares any word with the question.
    """
    question_words = _tokenize(question)
    if not question_words:
        return ""

    best_chunk = ""
    best_score = 0

    for chunks in processed_texts.values():
        for chunk in chunks:
            score = len(question_words & _tokenize(chunk))
            if score > best_score:
                best_score = score
                best_chunk = chunk

    return best_chunk[:1000] if best_chunk else ""
103
+
104
+
105
def answer_question(question, history):
    """Answer *question* from the processed documents and extend the chat history.

    Args:
        question: The text the user submitted.
        history: List of ``(user, bot)`` tuples, or ``None`` for a new chat.
            A list passed in is modified in place.

    Returns:
        ``(history, "")``: the history with one new ``(question, reply)``
        entry, and an empty string for the second output (wired to the
        message box by the UI).
    """
    turns = [] if history is None else history

    # Add the turn with an empty reply first; it is overwritten below once
    # the reply text is known.
    turns.append((question, ""))

    if not processed_texts:
        reply = "⚠️ Upload and process PDFs first."
    else:
        context = retrieve_context(question)
        if context:
            prompt = f"""
DOCUMENT CONTEXT:
{context}

QUESTION:
{question}

Answer clearly using the document context only.
"""
            reply = query_groq(prompt)
        else:
            reply = "❌ No relevant information found in documents."

    # Replace the placeholder with the final reply.
    turns[-1] = (question, reply)
    return turns, ""
139
+
140
+
141
+ # ================= UI =================
142
# Build the page: upload/preview controls, the preview table, then the chat.
with gr.Blocks(title="RAG PDF Chatbot") as demo:
    gr.Markdown("# 📚 RAG PDF Chatbot (Groq)")
    gr.Markdown("*Upload PDFs → Preview → Ask questions*")

    # NOTE(review): indentation was lost in the source; all four components
    # below are assumed to sit inside the Row — confirm the intended layout.
    with gr.Row():
        files = gr.File(file_types=[".pdf"], file_count="multiple")
        preview_btn = gr.Button("📄 Preview")
        process_btn = gr.Button("🚀 Process")
        status = gr.Textbox(label="Status")

    table = gr.DataFrame(
        headers=["File", "Words", "Preview", "Chunks"],
        interactive=False,
    )

    chatbot = gr.Chatbot(height=420)

    msg = gr.Textbox(
        placeholder="Ask a question from the documents...",
        lines=2,
    )
    send = gr.Button("Send")

    # Preview fills the table; Process only reports status.
    preview_btn.click(preview_documents, files, table)
    process_btn.click(process_documents, files, status)

    # The Send button and Enter in the textbox run the same handler, which
    # returns the updated history and an empty string for the textbox.
    send.click(answer_question, [msg, chatbot], [chatbot, msg])
    msg.submit(answer_question, [msg, chatbot], [chatbot, msg])

demo.launch()