afshanramzan commited on
Commit
bc85c93
·
verified ·
1 Parent(s): 9ab2da1

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +155 -0
  2. requirments.txt +6 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from pypdf import PdfReader
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import numpy as np
7
+ from groq import Groq
8
+
9
# -----------------------
# Initialize embedding model
# -----------------------
# Sentence-embedding model shared by retrieval below; runs at import time
# (first run downloads the weights from the Hugging Face hub).
model = SentenceTransformer("all-MiniLM-L6-v2")

# -----------------------
# Initialize Groq client
# -----------------------
# Key is read from the GROQ_API_KEY environment variable. NOTE(review):
# os.environ.get returns None when unset — the client still constructs and
# only fails later, at the first API call. Verify the Space secret is set.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
18
+
19
+ # -----------------------
20
+ # Helper functions
21
+ # -----------------------
22
def extract_text_from_pdfs(pdf_files):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_files: Iterable of PDF file paths / file-like objects accepted
            by pypdf's PdfReader (Gradio passes uploaded file objects).

    Returns:
        A single string with one newline appended after each page; empty
        string when *pdf_files* is empty.
    """
    text = ""
    for pdf in pdf_files:
        reader = PdfReader(pdf)
        for page in reader.pages:
            # extract_text() returns None for pages with no extractable
            # text (e.g. scanned images); fall back to "" to avoid a
            # TypeError on the concatenation.
            text += (page.extract_text() or "") + "\n"
    return text
29
+
30
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.
        The final chunk may be shorter than *chunk_size*.

    Raises:
        ValueError: If chunk_size is not positive, or overlap >= chunk_size
            (the advance per step would be <= 0 and the loop never ends).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # Fail fast: the original loop would spin forever on step <= 0.
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
39
+
40
def retrieve_chunks(pdf_files, question):
    """Return the 3 text chunks most similar to *question* (cosine similarity).

    Args:
        pdf_files: Uploaded PDF files (as passed by the Gradio File input).
        question: User's natural-language question.

    Returns:
        A list of up to 3 chunk strings, most similar first — or an error
        message string (prefixed with ❌) on missing/unusable input. Callers
        distinguish the two cases via isinstance(result, str).
    """
    if not pdf_files:
        return "❌ Please upload PDF files."
    if not question:
        return "❌ Please enter a question."

    text = extract_text_from_pdfs(pdf_files)
    chunks = chunk_text(text)
    if not chunks:
        # Image-only/empty PDFs produce no chunks; encoding an empty list
        # would break downstream, so reuse the error-string convention.
        return "❌ No extractable text found in the uploaded PDFs."

    chunk_embeddings = model.encode(chunks)
    question_embedding = model.encode([question])

    scores = cosine_similarity(question_embedding, chunk_embeddings)[0]
    # argsort is ascending; take the last 3 scores and reverse for best-first.
    # Slicing handles the case of fewer than 3 chunks gracefully.
    top_indices = np.argsort(scores)[-3:][::-1]

    return [chunks[i] for i in top_indices]
57
+
58
+ # -----------------------
59
+ # RAG + Groq LLM integration
60
+ # -----------------------
61
def answer_question(pdf_files, question, history):
    """Answer *question* from the uploaded PDFs via RAG + Groq LLM.

    Args:
        pdf_files: Uploaded PDF files.
        question: User's question.
        history: Running Q/A transcript (a string, possibly None/empty).

    Returns:
        Tuple of (answer text, updated history string). On input errors the
        error message is returned as the answer and history is unchanged.
    """
    chunks = retrieve_chunks(pdf_files, question)
    # retrieve_chunks signals bad input by returning an error string
    # instead of a list of chunks — pass it straight through to the UI.
    if isinstance(chunks, str):
        return chunks, history

    context = "\n\n".join(chunks)
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer concisely:"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    response = client.chat.completions.create(
        messages=messages,
        model="llama-3.1-8b-instant",
        max_tokens=300,
    )
    answer = response.choices[0].message.content

    # Append this exchange to the transcript (history may start out None).
    updated_history = (history or "") + f"Q: {question}\nA: {answer}\n\n"
    return answer, updated_history
84
+
85
+ # -----------------------
86
+ # PDF Summarization
87
+ # -----------------------
88
def summarize_pdf(pdf_files):
    """Summarize the uploaded PDFs with the Groq LLM.

    Only the first 5 chunks are sent, to keep the request small and fast.

    Args:
        pdf_files: Uploaded PDF files.

    Returns:
        The model's summary string, or an error message (prefixed with ❌)
        when no files were uploaded.
    """
    if not pdf_files:
        return "❌ Please upload PDF files first."

    raw_text = extract_text_from_pdfs(pdf_files)
    # summarize first 5 chunks for speed
    context = "\n\n".join(chunk_text(raw_text)[:5])
    prompt = f"Summarize the following PDF content concisely:\n\n{context}"

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful summarizer."},
            {"role": "user", "content": prompt},
        ],
        model="llama-3.1-8b-instant",
        max_tokens=200,
    )
    return response.choices[0].message.content
109
+
110
# -----------------------
# Gradio UI
# -----------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 RAG PDF Chatbot with History & PDF Summarization")

    # Inputs: one or more PDFs plus a free-text question.
    pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
    question_input = gr.Textbox(
        label="Ask your question here",
        placeholder="e.g. What is the main objective of this document?",
    )

    # Outputs: read-only running Q/A transcript, then the latest answer.
    history_box = gr.Textbox(label="Answer History", lines=10, interactive=False)
    answer_box = gr.Textbox(label="Answer", lines=8)

    # Buttons
    get_answer_btn = gr.Button("Get Answer")
    summarize_btn = gr.Button("Summarize PDF")

    # Q&A feeds the current history back in so the handler can append to it.
    get_answer_btn.click(
        fn=answer_question,
        inputs=[pdf_input, question_input, history_box],
        outputs=[answer_box, history_box],
    )
    # Summarization reuses the answer box for its output.
    summarize_btn.click(fn=summarize_pdf, inputs=[pdf_input], outputs=[answer_box])

demo.launch()
requirments.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ groq
3
+ pypdf
4
+ sentence-transformers
5
+ scikit-learn
6
+ numpy