deepkansara-123 commited on
Commit
cd7638d
·
verified ·
1 Parent(s): 4ad01b5

Upload 6 files

Browse files
Files changed (6) hide show
  1. ans_generator1.py +44 -0
  2. app.py +195 -0
  3. database1.py +39 -0
  4. first1.py +57 -0
  5. q_generator1.py +34 -0
  6. requirements.txt +13 -0
ans_generator1.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import PyPDF2
3
+ import os
4
+
5
+ UPLOAD_DIR = "uploaded_pdfs"
6
+
7
class AnswerGenerator:
    """Answers questions about a previously uploaded PDF with a local FLAN-T5 model.

    The PDF is located on disk as ``{UPLOAD_DIR}/{token}.pdf``, where ``token``
    is the upload token handed out when the file was stored.
    """

    def __init__(self):
        # Bug fix: FLAN-T5 is a sequence-to-sequence (text generation) model.
        # The original code loaded it through the extractive
        # "question-answering" pipeline, which expects a span-prediction
        # (BERT-style) QA head and does not work properly with seq2seq
        # checkpoints. Use "text2text-generation" and build an explicit
        # QA prompt instead.
        self.qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

    def extract_pdf_text(self, token):
        """Return a list with the extracted text of every page of the PDF.

        Raises:
            FileNotFoundError: when no uploaded PDF matches ``token``.
        """
        pdf_path = os.path.join(UPLOAD_DIR, f"{token}.pdf")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError("PDF not found for given token")

        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # extract_text() may return None for image-only pages; normalize to "".
            return [page.extract_text() or "" for page in reader.pages]

    def generate_answers(self, token, questions):
        """Answer each question in ``questions`` against the full PDF text.

        Returns a list of dicts, one per question. Successful answers look
        like ``{"question", "answer"}``; per-question failures are reported
        as ``{"question", "answer": "Error", "error"}`` instead of aborting
        the whole batch.
        """
        full_text = "\n".join(self.extract_pdf_text(token))  # merge pages as context
        results = []

        for question in questions:
            try:
                # Prompt the seq2seq model explicitly with context + question.
                prompt = f"Context: {full_text}\n\nQuestion: {question}\n\nAnswer:"
                output = self.qa_pipeline(prompt, max_length=256, do_sample=False)
                results.append({"question": question, "answer": output[0]["generated_text"].strip()})
            except Exception as e:
                results.append({"question": question, "answer": "Error", "error": str(e)})

        return results
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import uuid
3
+ import sqlite3
4
+ import json
5
+ import re
6
+ import PyPDF2
7
+ import numpy as np
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+
11
+ # Local imports
12
+ from database1 import create_db
13
+ from first1 import pdf_query
14
+
15
+ from ans_generator1 import AnswerGenerator
16
+
17
+ import sqlite3, json
18
+ from q_generator1 import QGenerator
19
+ from transformers import pipeline
20
# Initialize models
# Both models are loaded once at import time (each download/load is slow).
qgen = QGenerator()          # question generator (valhalla/t5-small-qg-hl)
ansgen = AnswerGenerator()   # PDF answer generator (FLAN-T5 helper class)

# Load FLAN-T5 model
# use_fast=False forces the slow (sentencepiece-based) tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# text2text-generation pipeline used for prompted question answering below.
qa_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
28
+
29
+
30
# ✅ Upload and process PDF
# ✅ Updated version – supports multiple PDF files
def upload_pdf(files):
    """Store each uploaded PDF in the SQLite database under a fresh token.

    For every file: extract the text of all pages, split it into fixed-size
    500-character chunks, and persist token/chunks/filename/full text via
    ``create_db``. Returns a human-readable status string, one line per file,
    or a single error line if anything fails.
    """
    try:
        messages = []

        for file in files:
            filename = file.name
            token = str(uuid.uuid4())

            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text() may return None for image-only pages.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            # Fixed-width chunking; the last chunk may be shorter than 500.
            chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

            create_db(token, chunks, filename, text)
            # Bug fix: the status line previously printed a literal
            # "(unknown)" placeholder; report the actual filename.
            messages.append(f"✅ Uploaded and stored: {filename} (Token: {token})")

        return "\n".join(messages)

    except Exception as e:
        return f"❌ Error: {str(e)}"
51
+
52
+
53
+
54
# Load QG and QA once
# NOTE(review): these two assignments rebind the module-level `qgen` and
# `qa_model` that were already created near the top of this file with
# equivalent objects, so both models are actually loaded twice at import
# time — consider removing one of the two initialisations.
qgen = QGenerator()
qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
57
def generate_qa(token):
    """Generate question/answer pairs from the PDF stored under ``token``.

    Looks up the JSON-encoded chunk list in the ``token_data`` table, asks
    the question-generator (``qgen``) for questions per chunk, then answers
    at most 2 questions per chunk with the FLAN-T5 ``qa_model``. Returns a
    single string of "Q: ...\\nA: ..." blocks, or a user-facing status/error
    message. All intermediate steps are printed for server-side debugging.

    NOTE(review): the UI wires this up with a textbox labelled "filename",
    but the lookup below is by ``token_id`` — callers must pass the upload
    token, not the PDF's filename.
    """
    try:
        if not token:
            return "⚠️ Please provide a token."

        print("📥 Received Token:", token)

        # Load chunk_data using token
        with sqlite3.connect("my_database.db") as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT chunk_data FROM token_data WHERE token_id = ?", (token,))
            row = cursor.fetchone()

        if not row:
            print("❌ No data found for token in DB.")
            return "❌ No data found for this token."

        # chunk_data is stored as a JSON array of text chunks (see database1.py).
        chunks = json.loads(row[0])
        if not chunks:
            print("⚠️ Chunk data is empty.")
            return "⚠️ No content available in database for this PDF."

        qa_pairs = []

        for i, chunk in enumerate(chunks):
            print(f"\n🔹 Processing chunk {i+1}/{len(chunks)}")
            questions = qgen.generate(chunk)
            print(f"🧠 Questions generated: {questions}")

            if not questions:
                print("⚠️ No questions generated for this chunk.")
                continue

            for question in questions[:2]:  # Max 2 Qs per chunk
                # The seq2seq model answers from an explicit context+question prompt.
                prompt = f"Context: {chunk}\n\nQuestion: {question}\n\nAnswer:"
                print(f"➡️ Prompt:\n{prompt}")

                try:
                    result = qa_model(prompt, max_length=256, do_sample=False)
                    print(f"⬅️ Raw model output: {result}")

                    # text2text-generation returns [{"generated_text": ...}];
                    # the dict/"answer" branch is a defensive fallback for
                    # other pipeline result shapes.
                    if isinstance(result, list) and "generated_text" in result[0]:
                        answer = result[0]["generated_text"].strip()
                    elif isinstance(result, dict) and "answer" in result:
                        answer = result["answer"].strip()
                    else:
                        answer = "N/A"

                    print(f"✅ Final Answer: {answer}")
                    qa_pairs.append(f"Q: {question}\nA: {answer}")

                except Exception as e:
                    # A single failed question should not abort the whole run.
                    print(f"❌ QA model failed: {e}")
                    continue

        if not qa_pairs:
            print("⚠️ No Q&A pairs generated.")
            return "⚠️ No Q&A pairs generated."

        print("✅ Final Q&A generated successfully.")
        return "\n\n".join(qa_pairs)

    except Exception as e:
        print(f"🔥 Exception in generate_qa(): {e}")
        return f"❌ Error: {str(e)}"
122
+
123
+
124
# ✅ Ask question using token (semantic similarity)
def ask_question(token, question):
    """Answer ``question`` with the stored chunk most similar to it.

    Retrieves the chunk list stored under ``token``, embeds the question and
    all non-empty chunks with the sentence-transformer from ``pdf_query``,
    and returns the highest-cosine-similarity chunk together with its score.
    Returns a user-facing error string on any failure.
    """
    try:
        with sqlite3.connect("my_database.db") as conn:
            record = conn.execute(
                "SELECT chunk_data FROM token_data WHERE token_id = ?", (token,)
            ).fetchone()

        if not record:
            return "❌ Token not found."

        stored_chunks = json.loads(record[0])
        encoder = pdf_query().model

        # Drop empty chunks and collapse internal whitespace for display.
        candidates = [re.sub(r'\s+', ' ', chunk.strip()) for chunk in stored_chunks if chunk.strip()]
        if not candidates:
            return "⚠️ No valid content found in PDF."

        # One similarity score per candidate chunk (single query row).
        similarity_row = cosine_similarity(encoder.encode([question]), encoder.encode(candidates))[0]

        best_idx = int(np.argmax(similarity_row))
        best_score = float(similarity_row[best_idx])
        best_text = candidates[best_idx]

        return f"Q: {question}\nA: {best_text}\nScore: {round(best_score, 3)}"

    except Exception as e:
        return f"❌ Error: {str(e)}"
155
+
156
+
157
+
158
+
159
+
160
+
161
# ✅ Gradio UI
# Bug fix: the original code built TWO separate gr.Blocks() apps and bound
# both to `demo`; the second assignment shadowed the first, so the Upload
# tab was never served. All three tabs now live in a single Blocks app.
# The Q&A tab also asked for a *filename* while generate_qa() looks records
# up by token_id, so its input is now labelled as a token.
with gr.Blocks(theme="default", title="PDF Q&A Generator") as demo:
    gr.Markdown(
        """
        <div style='text-align: center; padding: 1rem;'>
            <h1 style='color: #3b82f6;'>📄 AI-Powered PDF Q&A System</h1>
            <p style='font-size: 1.1rem;'>Upload your PDFs, generate smart questions, and get intelligent answers.</p>
        </div>
        """
    )

    with gr.Tab("📤 1. Upload PDF"):
        gr.Markdown("### 🗂 Upload a PDF File")
        file = gr.File(label="Upload one or more PDFs", file_types=[".pdf"], file_count="multiple")
        upload_out = gr.Textbox(label="Upload Result", interactive=False)
        # Uploading triggers processing immediately; the result shows the tokens.
        file.change(fn=upload_pdf, inputs=file, outputs=upload_out)

    with gr.Tab("🧠 2. Generate Questions & Answers"):
        gr.Markdown("### 🤖 Generate Questions and Answers from Uploaded PDF")
        qa_token = gr.Textbox(label="🔑 Enter Upload Token", placeholder="e.g., 123e4567-e89b-12d3-a456...")
        output_box = gr.Textbox(label="📝 Generated Q&A", lines=15, interactive=False)
        gr.Button("🚀 Generate Q&A").click(fn=generate_qa, inputs=qa_token, outputs=output_box)

    with gr.Tab("❓ 3. Ask a Question"):
        gr.Markdown("### 💬 Ask a question based on uploaded PDF")
        token_box = gr.Textbox(label="Token ID", placeholder="e.g., 123e4567-e89b-12d3-a456...")
        question_box = gr.Textbox(label="Type your question", placeholder="What is the main topic discussed?")
        answer_result = gr.Textbox(label="Answer Output", lines=6, interactive=False)
        gr.Button("🎯 Get Answer").click(fn=ask_question, inputs=[token_box, question_box], outputs=answer_result)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0", server_port=7860)
195
+
database1.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+
4
class create_db:
    """Persists one uploaded PDF into the local SQLite database.

    Instantiating the class performs the insert (app.py uses it like a
    function): it creates the ``token_data`` table on first use and stores
    the chunk list (JSON-encoded), the original filename and the full
    extracted text under ``token``. Duplicate tokens are reported, not raised.
    """

    def __init__(self, token, chunk_json1, filename, full_content):
        conn = sqlite3.connect('my_database.db')
        try:
            cursor = conn.cursor()

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS token_data (
                    token_id TEXT PRIMARY KEY,
                    chunk_data TEXT,
                    filename TEXT,
                    full_content TEXT
                )
            """)

            # Chunks are stored as a JSON array in a single TEXT column.
            chunk_json = json.dumps(chunk_json1)

            try:
                cursor.execute(
                    "INSERT INTO token_data (token_id, chunk_data, filename, full_content) VALUES (?, ?, ?, ?)",
                    (token, chunk_json, filename, full_content)
                )
                conn.commit()
                # Bug fix: these messages previously printed a literal
                # "(unknown)" placeholder instead of the actual filename.
                print({"message": f"✅ {filename} uploaded and stored successfully"})
            except sqlite3.IntegrityError:
                # token_id is the PRIMARY KEY, so a duplicate token is rejected.
                print({"error": f"❌ Token already exists for: {filename}"})
        finally:
            # Bug fix: the connection leaked if table creation or JSON
            # encoding raised; always close it.
            conn.close()

    @staticmethod
    def get_all_filenames():
        """Return ``{"pdfs": [{"filename": ...}, ...]}`` for every stored PDF."""
        conn = sqlite3.connect('my_database.db')
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT filename FROM token_data")
            rows = cursor.fetchall()
        finally:
            conn.close()
        return {"pdfs": [{"filename": row[0]} for row in rows]}
first1.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from sentence_transformers import SentenceTransformer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+ import re
6
+
7
+
8
class pdf_query:
    """Semantic-search helper over a PDF's text.

    Wraps PDF text extraction, sentence-boundary chunking, sentence-transformer
    embedding, and cosine-similarity lookup of the chunk that best matches a
    question.
    """

    def __init__(self):
        # Sentence-embedding model used for similarity scoring.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # PyPDF2 reader; populated by file().
        self.read = None

    def file(self, file):
        """Attach a PDF file handle for later text extraction."""
        self.read = PyPDF2.PdfReader(file)

    def extract_text(self):
        """Concatenate the text of every page, one page per line."""
        page_texts = (page.extract_text() for page in self.read.pages)
        # Pages with no extractable text (e.g. scanned images) are skipped.
        return "\n".join(content for content in page_texts if content).strip()

    def split_into_chunks(self, text, chunk_size=300):
        """Group sentences into chunks of at most ~chunk_size characters.

        Sentences are found by splitting after ., ! or ? so chunk boundaries
        fall on sentence boundaries where possible.
        """
        pieces = re.split(r'(?<=[.!?])\s+', text)
        result = []
        buf = ""
        idx = 0
        while idx < len(pieces):
            piece = pieces[idx]
            if len(buf) + len(piece) > chunk_size:
                # Current buffer is full: flush it and start a new chunk.
                result.append(buf.strip())
                buf = piece + " "
            else:
                buf += piece + " "
            idx += 1
        if buf:
            result.append(buf.strip())

        return result

    def creat_model(self, chunks):
        """Build a fresh encoder and embed `chunks`.

        (Method name kept as-is for backward compatibility with callers.)
        """
        encoder = SentenceTransformer("all-MiniLM-L6-v2")
        return encoder, encoder.encode(chunks)

    def answer_question(self, question, chunks, chunk_embeddings, model, threshold=0.6):
        """Return the cleaned best-matching chunk, or a not-found dict.

        The question is embedded with the same model that produced
        `chunk_embeddings`; chunks scoring below `threshold` are rejected.
        """
        scores = cosine_similarity(model.encode([question]), chunk_embeddings)
        if np.max(scores) < threshold:
            return {"answer": "Answer not found in PDF"}
        # Collapse internal whitespace in the winning chunk before returning.
        winner = chunks[np.argmax(scores)]
        return re.sub(r'\s+', ' ', winner.strip())
q_generator1.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, pipeline
2
+
3
class QGenerator:
    """Generates questions from raw text with the valhalla/t5-small-qg-hl model."""

    def __init__(self):
        # use_fast=False selects the slow sentencepiece tokenizer.
        tok = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-hl", use_fast=False)
        mdl = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qg-hl")
        self.qg = pipeline("text2text-generation", model=mdl, tokenizer=tok)

    def split_sentences(self, text):
        """Naively split on '.'; empty fragments are dropped.

        (For better results, use nltk or spacy.)
        """
        sentences = []
        for fragment in text.split('.'):
            fragment = fragment.strip()
            if fragment:
                sentences.append(fragment)
        return sentences

    def chunk_text(self, text, chunk_size=512):
        """Cut `text` into consecutive chunks of at most `chunk_size` chars."""
        chunks = []
        start = 0
        while start < len(text):
            chunks.append(text[start:start + chunk_size])
            start += chunk_size
        return chunks

    def generate(self, text, max_questions=5):
        """Generate up to `max_questions` unique questions from `text`.

        Each sentence is fed to the QG model separately; per-sentence model
        failures are logged and skipped rather than aborting the batch.
        """
        collected = []

        for sentence in self.split_sentences(text):
            if len(collected) >= max_questions:
                break

            prompt = f"generate question: {sentence} </s>"
            try:
                output = self.qg(prompt, max_length=64, num_return_sequences=1)[0]
                candidate = output["generated_text"]
                # De-duplicate and ignore empty generations.
                if candidate and candidate not in collected:
                    collected.append(candidate)
            except Exception as err:
                print("Error generating question:", err)
                continue

        return collected
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ sentence-transformers
5
+ PyPDF2
6
+ scikit-learn
7
+ numpy
8
+ # uuid — removed from installable requirements: it is part of the Python standard library (the PyPI "uuid" package is an obsolete Python 2 shim)
9
+ # sentence_transformers — removed: duplicate of the sentence-transformers entry above (pip normalizes _ and - to the same package)
10
+ sentencepiece
11
+ tiktoken
12
+
13
+