deepkansara-123 committed on
Commit
8ca7a2a
·
verified ·
1 Parent(s): 3221f2f

Upload 6 files

Browse files
Files changed (4) hide show
  1. app.py +126 -47
  2. database1.py +3 -10
  3. q_generator1.py +2 -2
  4. requirements.txt +5 -0
app.py CHANGED
@@ -4,113 +4,192 @@ import sqlite3
4
  import json
5
  import re
6
  import PyPDF2
7
- import io
8
  import numpy as np
9
- from transformers import pipeline
10
  from sklearn.metrics.pairwise import cosine_similarity
11
 
 
12
  from database1 import create_db
13
  from first1 import pdf_query
14
- from q_generator1 import QGenerator
15
  from ans_generator1 import AnswerGenerator
16
 
17
- # Models and tools
 
 
 
18
  qgen = QGenerator()
19
  ansgen = AnswerGenerator()
20
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
21
 
 
22
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=False)
23
  model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
24
  qa_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
25
 
26
 
27
- # ✅ Upload PDF and store to SQLite
28
- def upload_pdf(file):
 
29
  try:
30
- filename = file.name
31
- token = str(uuid.uuid4())
32
 
33
- pdf_reader = PyPDF2.PdfReader(file)
34
- text = "".join([page.extract_text() or "" for page in pdf_reader.pages])
35
- chunks = [text[i:i+500] for i in range(0, len(text), 500)]
 
 
 
 
 
 
 
 
 
36
 
37
- create_db(token, chunks, filename, text)
38
- return f"✅ Uploaded and stored: {filename} (Token: {token})"
39
  except Exception as e:
40
  return f"❌ Error: {str(e)}"
41
 
42
- # ✅ Generate Q&A from filename
43
- def generate_qa(filename):
 
 
 
 
44
  try:
 
 
 
 
 
 
45
  with sqlite3.connect("my_database.db") as conn:
46
  cursor = conn.cursor()
47
- cursor.execute("SELECT chunk_data FROM token_data WHERE filename = ?", (filename,))
48
  row = cursor.fetchone()
49
 
50
  if not row:
51
- return "❌ No data found for this filename."
 
52
 
53
  chunks = json.loads(row[0])
 
 
 
 
54
  qa_pairs = []
55
 
56
- for chunk in chunks:
 
57
  questions = qgen.generate(chunk)
 
 
58
  if not questions:
 
59
  continue
60
- question = questions[0]
61
- prompt = f"Context: {chunk}\n\nQuestion: {question}\n\nAnswer:"
62
- result = qa_model(prompt, max_length=256, do_sample=False)
63
- answer = result[0]["generated_text"].strip()
64
- qa_pairs.append(f"Q: {question}\nA: {answer}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  return "\n\n".join(qa_pairs)
 
66
  except Exception as e:
 
67
  return f"❌ Error: {str(e)}"
68
 
69
- # ✅ Ask a question using token (semantic similarity)
 
70
  def ask_question(token, question):
71
  try:
72
  with sqlite3.connect("my_database.db") as conn:
73
  cursor = conn.cursor()
74
  cursor.execute("SELECT chunk_data FROM token_data WHERE token_id = ?", (token,))
75
  row = cursor.fetchone()
 
76
  if not row:
77
  return "❌ Token not found."
78
 
79
  chunks = json.loads(row[0])
80
  processor = pdf_query()
81
  model = processor.model
82
- chunk_embeddings = model.encode(chunks)
 
 
 
 
 
83
  q_embedding = model.encode([question])
84
  scores = cosine_similarity(q_embedding, chunk_embeddings)[0]
 
85
  top_index = int(np.argmax(scores))
86
  top_score = float(scores[top_index])
87
- best_text = re.sub(r'\s+', ' ', chunks[top_index].strip())
 
 
88
 
89
- if top_score >= 0.5:
90
- return f"Q: {question}\nA: {best_text}\nScore: {round(top_score, 3)}"
91
- else:
92
- return "⚠️ No relevant answer found (score too low)."
93
  except Exception as e:
94
  return f"❌ Error: {str(e)}"
95
 
96
- # ✅ Gradio UI
97
- with gr.Blocks() as demo:
98
- gr.Markdown("# 📄 PDF QA System")
99
 
100
- with gr.Tab("1. Upload PDF"):
101
- file = gr.File(label="Upload a PDF")
102
- upload_out = gr.Textbox(label="Upload Result")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  file.change(fn=upload_pdf, inputs=file, outputs=upload_out)
104
 
105
- with gr.Tab("2. Generate Q&A"):
106
- fname = gr.Textbox(label="Enter uploaded filename")
107
- qa_result = gr.Textbox(label="Q&A Output", lines=10)
108
- gr.Button("Generate Q&A").click(fn=generate_qa, inputs=fname, outputs=qa_result)
 
 
 
 
 
 
 
 
 
109
 
110
- with gr.Tab("3. Ask a Question"):
111
- token_box = gr.Textbox(label="Enter Token ID")
112
- question_box = gr.Textbox(label="Your Question")
113
- answer_result = gr.Textbox(label="Answer", lines=5)
114
- gr.Button("Ask").click(fn=ask_question, inputs=[token_box, question_box], outputs=answer_result)
115
 
116
- demo.launch()
 
4
  import json
5
  import re
6
  import PyPDF2
 
7
  import numpy as np
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
+ # Local imports
12
  from database1 import create_db
13
  from first1 import pdf_query
14
+
15
  from ans_generator1 import AnswerGenerator
16
 
17
+ import sqlite3, json
18
+ from q_generator1 import QGenerator
19
+ from transformers import pipeline
20
# Generators are constructed once, at import time.
qgen = QGenerator()
ansgen = AnswerGenerator()

# FLAN-T5 text2text pipeline used to answer the generated questions.
# `use_fast=False` asks for the non-fast tokenizer implementation.
_FLAN_T5_CHECKPOINT = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(_FLAN_T5_CHECKPOINT, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(_FLAN_T5_CHECKPOINT)
qa_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
28
 
29
 
30
# ✅ Upload and process one or more PDF files
def upload_pdf(files):
    """Extract the text of each uploaded PDF and store it under a new token.

    Args:
        files: Value delivered by the upload widget. ``None`` or an empty
            list means nothing is selected; a single item is treated as a
            one-element list. Items may be path strings or objects exposing
            the path as ``.name``.

    Returns:
        One status line per file, joined with newlines. A file that cannot
        be processed yields an error line for that file only, so tokens of
        the files that were stored are still reported.
    """
    if not files:
        # The change event also fires when the selection is cleared.
        return "⚠️ No files uploaded."
    if not isinstance(files, (list, tuple)):
        files = [files]

    messages = []
    for file in files:
        # NOTE(review): depending on the Gradio version / `type` setting the
        # item is a path string or a tempfile-like object — handle both.
        filename = getattr(file, "name", file)
        try:
            token = str(uuid.uuid4())

            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text() may return None for pages without a text layer.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

            create_db(token, chunks, filename, text)
            messages.append(f"✅ Uploaded and stored: {filename} (Token: {token})")
        except Exception as e:
            # Top-level UI boundary: report the failure for this file and
            # carry on with the remaining ones.
            messages.append(f"❌ Error processing {filename}: {str(e)}")

    return "\n".join(messages)
51
 
52
+
53
+
54
# `qgen` and `qa_model` are already created once by the model-initialisation
# statements earlier in this module, so they are not rebuilt here; building
# them again loaded the question generator and FLAN-T5 twice at start-up.


def _extract_answer(result):
    """Return the answer text from a pipeline result, or "N/A" if absent."""
    if isinstance(result, list) and result and "generated_text" in result[0]:
        return result[0]["generated_text"].strip()
    if isinstance(result, dict) and "answer" in result:
        return result["answer"].strip()
    return "N/A"


def generate_qa(token):
    """Generate question/answer pairs for the PDF stored under ``token``.

    Args:
        token: Token ID of an uploaded PDF. Surrounding whitespace is
            ignored; an empty or missing value is rejected.

    Returns:
        The generated pairs formatted as ``"Q: ...\\nA: ..."`` blocks
        separated by blank lines, or a human-readable warning/error message
        when nothing could be generated.
    """
    from contextlib import closing

    token = str(token).strip() if token else ""
    if not token:
        return "⚠️ Please provide a token."

    try:
        print("📥 Received Token:", token)

        # sqlite3's own context manager only commits/rolls back; closing()
        # makes sure the connection itself is released.
        with closing(sqlite3.connect("my_database.db")) as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT chunk_data FROM token_data WHERE token_id = ?", (token,))
            row = cursor.fetchone()

        if not row:
            print("❌ No data found for token in DB.")
            return "❌ No data found for this token."

        chunks = json.loads(row[0])
        if not chunks:
            print("⚠️ Chunk data is empty.")
            return "⚠️ No content available in database for this PDF."

        qa_pairs = []

        for i, chunk in enumerate(chunks):
            print(f"\n🔹 Processing chunk {i+1}/{len(chunks)}")
            if not chunk.strip():
                # Whitespace-only chunks carry nothing to ask about.
                print("⚠️ No questions generated for this chunk.")
                continue

            questions = qgen.generate(chunk)
            print(f"🧠 Questions generated: {questions}")

            if not questions:
                print("⚠️ No questions generated for this chunk.")
                continue

            for question in questions[:2]:  # Max 2 Qs per chunk
                prompt = f"Context: {chunk}\n\nQuestion: {question}\n\nAnswer:"
                print(f"➡️ Prompt:\n{prompt}")

                try:
                    result = qa_model(prompt, max_length=256, do_sample=False)
                except Exception as e:
                    # Best effort: a failure on one question must not lose
                    # the pairs already produced.
                    print(f"❌ QA model failed: {e}")
                    continue

                print(f"⬅️ Raw model output: {result}")
                answer = _extract_answer(result)
                print(f"✅ Final Answer: {answer}")
                qa_pairs.append(f"Q: {question}\nA: {answer}")

        if not qa_pairs:
            print("⚠️ No Q&A pairs generated.")
            return "⚠️ No Q&A pairs generated."

        print("✅ Final Q&A generated successfully.")
        return "\n\n".join(qa_pairs)

    except Exception as e:
        print(f"🔥 Exception in generate_qa(): {e}")
        return f"❌ Error: {str(e)}"
122
 
123
+
124
# ✅ Ask question using token (semantic similarity)
def ask_question(token, question):
    """Return the stored chunk most similar to ``question``.

    Args:
        token: Token ID of an uploaded PDF. Surrounding whitespace is
            ignored; an empty or missing value is rejected.
        question: The user's question; must not be blank.

    Returns:
        ``"Q: <question>\\nA: <best chunk>\\nScore: <cosine similarity>"``
        with the score rounded to three decimals, or a warning/error
        message. No minimum score is enforced: the best-scoring chunk is
        always returned.
    """
    from contextlib import closing

    token = str(token).strip() if token else ""
    if not token:
        return "⚠️ Please provide a token."
    question = str(question).strip() if question else ""
    if not question:
        return "⚠️ Please enter a question."

    try:
        # sqlite3's own context manager only commits/rolls back; closing()
        # makes sure the connection itself is released.
        with closing(sqlite3.connect("my_database.db")) as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT chunk_data FROM token_data WHERE token_id = ?", (token,))
            row = cursor.fetchone()

        if not row:
            return "❌ Token not found."

        chunks = json.loads(row[0])

        # Collapse runs of whitespace and drop blank chunks before embedding.
        clean_chunks = [re.sub(r'\s+', ' ', c.strip()) for c in chunks if c.strip()]
        if not clean_chunks:
            return "⚠️ No valid content found in PDF."

        # Named `encoder` so it does not shadow the module-level `model`.
        encoder = pdf_query().model

        chunk_embeddings = encoder.encode(clean_chunks)
        q_embedding = encoder.encode([question])
        scores = cosine_similarity(q_embedding, chunk_embeddings)[0]

        top_index = int(np.argmax(scores))
        top_score = float(scores[top_index])
        best_text = clean_chunks[top_index]

        return f"Q: {question}\nA: {best_text}\nScore: {round(top_score, 3)}"

    except Exception as e:
        return f"❌ Error: {str(e)}"
155
 
 
 
 
156
 
157
+
158
+
159
+
160
+
161
# ✅ Gradio UI
# All three tabs live in ONE Blocks container. Opening a second
# `gr.Blocks(...) as demo` rebinds `demo`, which leaves the Upload tab in a
# different Blocks object than the one passed to `launch()`.
with gr.Blocks(theme="default", title="PDF Q&A Generator") as demo:
    gr.Markdown(
        """
        <div style='text-align: center; padding: 1rem;'>
        <h1 style='color: #3b82f6;'>📄 AI-Powered PDF Q&A System</h1>
        <p style='font-size: 1.1rem;'>Upload your PDFs, generate smart questions, and get intelligent answers.</p>
        </div>
        """
    )

    with gr.Tab("📤 1. Upload PDF"):
        gr.Markdown("### 🗂 Upload a PDF File")
        file = gr.File(label="Upload one or more PDFs", file_types=[".pdf"], file_count="multiple")

        upload_out = gr.Textbox(label="Upload Result", interactive=False)
        file.change(fn=upload_pdf, inputs=file, outputs=upload_out)

    with gr.Tab("🧠 2. Generate Questions & Answers"):
        gr.Markdown("### 🤖 Generate Questions and Answers from Uploaded PDF")
        # generate_qa() looks the PDF up by token, so ask for the token that
        # the upload step reported rather than for a filename.
        fname = gr.Textbox(label="🔑 Enter Token ID", placeholder="e.g., 123e4567-e89b-12d3-a456...")
        output_box = gr.Textbox(label="📝 Generated Q&A", lines=15, interactive=False)
        gr.Button("🚀 Generate Q&A").click(fn=generate_qa, inputs=fname, outputs=output_box)

    with gr.Tab("❓ 3. Ask a Question"):
        gr.Markdown("### 💬 Ask a question based on uploaded PDF")
        token_box = gr.Textbox(label="Token ID", placeholder="e.g., 123e4567-e89b-12d3-a456...")
        question_box = gr.Textbox(label="Type your question", placeholder="What is the main topic discussed?")
        answer_result = gr.Textbox(label="Answer Output", lines=6, interactive=False)
        gr.Button("🎯 Get Answer").click(fn=ask_question, inputs=[token_box, question_box], outputs=answer_result)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
195
 
 
database1.py CHANGED
@@ -6,7 +6,6 @@ class create_db:
6
  conn = sqlite3.connect('my_database.db')
7
  cursor = conn.cursor()
8
 
9
- # Only store into this table
10
  cursor.execute("""
11
  CREATE TABLE IF NOT EXISTS token_data (
12
  token_id TEXT PRIMARY KEY,
@@ -24,9 +23,9 @@ class create_db:
24
  (token, chunk_json, filename, full_content)
25
  )
26
  conn.commit()
27
- print({"message": "PDF uploaded and stored successfully"})
28
  except sqlite3.IntegrityError:
29
- print({"error": "Token already exists."})
30
 
31
  conn.close()
32
 
@@ -37,10 +36,4 @@ class create_db:
37
  cursor.execute("SELECT filename FROM token_data")
38
  rows = cursor.fetchall()
39
  conn.close()
40
-
41
- if rows:
42
- return {
43
- "pdfs": [{"filename": row[0]} for row in rows]
44
- }
45
- else:
46
- return {"pdfs": []}
 
6
  conn = sqlite3.connect('my_database.db')
7
  cursor = conn.cursor()
8
 
 
9
  cursor.execute("""
10
  CREATE TABLE IF NOT EXISTS token_data (
11
  token_id TEXT PRIMARY KEY,
 
23
  (token, chunk_json, filename, full_content)
24
  )
25
  conn.commit()
26
+ print({"message": f" {filename} uploaded and stored successfully"})
27
  except sqlite3.IntegrityError:
28
+ print({"error": f"Token already exists for: {filename}"})
29
 
30
  conn.close()
31
 
 
36
  cursor.execute("SELECT filename FROM token_data")
37
  rows = cursor.fetchall()
38
  conn.close()
39
+ return {"pdfs": [{"filename": row[0]} for row in rows]}
 
 
 
 
 
 
q_generator1.py CHANGED
@@ -1,8 +1,8 @@
1
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
 
3
  class QGenerator:
4
  def __init__(self):
5
- tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-small-qg-hl", use_fast=False)
6
  model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qg-hl")
7
  self.qg = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
8
 
 
1
+ from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, pipeline
2
 
3
  class QGenerator:
4
  def __init__(self):
5
+ tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-hl", use_fast=False)
6
  model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qg-hl")
7
  self.qg = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
8
 
requirements.txt CHANGED
@@ -6,3 +6,8 @@ PyPDF2
6
  scikit-learn
7
  numpy
8
  uuid
 
 
 
 
 
 
6
  scikit-learn
7
  numpy
8
  uuid
9
+ sentence_transformers
10
+ sentencepiece
11
+ tiktoken
12
+
13
+