deepkansara-123 committed on
Commit 9ddeec6 · verified · 1 Parent(s): 5b28109

Upload 6 files

Files changed (6)
  1. ans_generator1.py +44 -0
  2. app.py +116 -0
  3. database1.py +46 -0
  4. first1.py +57 -0
  5. q_generator1.py +34 -0
  6. requirements.txt +8 -0
ans_generator1.py ADDED
@@ -0,0 +1,44 @@
+ from transformers import pipeline
+ import PyPDF2
+ import os
+
+ UPLOAD_DIR = "uploaded_pdfs"
+
+ class AnswerGenerator:
+     def __init__(self):
+         # ✅ Default FLAN-T5 model for question answering
+         # (note: flan-t5-base is a text-to-text model rather than an extractive QA
+         # model, so the "question-answering" pipeline may warn and give weak answers)
+         self.qa_pipeline = pipeline("question-answering", model="google/flan-t5-base")
+
+         # ---------------------------------------------------------------
+         # updated the model
+         # self.qa_pipeline = pipeline(
+         #     "question-answering",
+         #     model="tiiuae/falcon-7b-instruct",      # <-- Updated model here
+         #     tokenizer="tiiuae/falcon-7b-instruct")  # <-- Explicitly specifying tokenizer
+         # -----------------------------------------------------------------
+
+     def extract_pdf_text(self, token):
+         pdf_path = os.path.join(UPLOAD_DIR, f"{token}.pdf")
+         if not os.path.exists(pdf_path):
+             raise FileNotFoundError("PDF not found for given token")
+
+         with open(pdf_path, "rb") as f:
+             reader = PyPDF2.PdfReader(f)
+             return [page.extract_text() or "" for page in reader.pages]  # List of page texts
+
+     def generate_answers(self, token, questions):
+         pages = self.extract_pdf_text(token)
+         full_text = "\n".join(pages)  # Merge pages as context
+         results = []
+
+         for question in questions:
+             try:
+                 # ✅ Default FLAN-T5 logic
+                 result = self.qa_pipeline(question=question, context=full_text)
+                 results.append({"question": question, "answer": result["answer"]})
+             except Exception as e:
+                 results.append({"question": question, "answer": "Error", "error": str(e)})
+
+         return results
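
For reference, a minimal sketch of exercising AnswerGenerator on its own, assuming a PDF has already been saved as uploaded_pdfs/<token>.pdf; the token and question below are placeholders:

from ans_generator1 import AnswerGenerator

gen = AnswerGenerator()
token = "123e4567-e89b-12d3-a456-426614174000"   # placeholder token for uploaded_pdfs/<token>.pdf
answers = gen.generate_answers(token, ["What is the main topic of the document?"])
for item in answers:
    print(item["question"], "->", item["answer"])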
app.py ADDED
@@ -0,0 +1,116 @@
+ import gradio as gr
+ import uuid
+ import sqlite3
+ import json
+ import re
+ import PyPDF2
+ import io
+ import numpy as np
+ from transformers import pipeline
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from database1 import create_db
+ from first1 import pdf_query
+ from q_generator1 import QGenerator
+ from ans_generator1 import AnswerGenerator
+
+ # Models and tools
+ qgen = QGenerator()
+ ansgen = AnswerGenerator()
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=False)
+ model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
+ qa_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+
+
+ # ✅ Upload PDF and store to SQLite
+ def upload_pdf(file):
+     try:
+         filename = file.name
+         token = str(uuid.uuid4())
+
+         pdf_reader = PyPDF2.PdfReader(file)
+         text = "".join([page.extract_text() or "" for page in pdf_reader.pages])
+         chunks = [text[i:i+500] for i in range(0, len(text), 500)]
+
+         create_db(token, chunks, filename, text)
+         return f"✅ Uploaded and stored: {filename} (Token: {token})"
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
+ # ✅ Generate Q&A from filename
+ def generate_qa(filename):
+     try:
+         with sqlite3.connect("my_database.db") as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT chunk_data FROM token_data WHERE filename = ?", (filename,))
+             row = cursor.fetchone()
+
+         if not row:
+             return "❌ No data found for this filename."
+
+         chunks = json.loads(row[0])
+         qa_pairs = []
+
+         for chunk in chunks:
+             questions = qgen.generate(chunk)
+             if not questions:
+                 continue
+             question = questions[0]
+             prompt = f"Context: {chunk}\n\nQuestion: {question}\n\nAnswer:"
+             result = qa_model(prompt, max_length=256, do_sample=False)
+             answer = result[0]["generated_text"].strip()
+             qa_pairs.append(f"Q: {question}\nA: {answer}")
+         return "\n\n".join(qa_pairs)
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
+ # ✅ Ask a question using token (semantic similarity)
+ def ask_question(token, question):
+     try:
+         with sqlite3.connect("my_database.db") as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT chunk_data FROM token_data WHERE token_id = ?", (token,))
+             row = cursor.fetchone()
+         if not row:
+             return "❌ Token not found."
+
+         chunks = json.loads(row[0])
+         processor = pdf_query()
+         model = processor.model
+         chunk_embeddings = model.encode(chunks)
+         q_embedding = model.encode([question])
+         scores = cosine_similarity(q_embedding, chunk_embeddings)[0]
+         top_index = int(np.argmax(scores))
+         top_score = float(scores[top_index])
+         best_text = re.sub(r'\s+', ' ', chunks[top_index].strip())
+
+         if top_score >= 0.5:
+             return f"Q: {question}\nA: {best_text}\nScore: {round(top_score, 3)}"
+         else:
+             return "⚠️ No relevant answer found (score too low)."
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
+ # ✅ Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📄 PDF QA System")
+
+     with gr.Tab("1. Upload PDF"):
+         file = gr.File(label="Upload a PDF")
+         upload_out = gr.Textbox(label="Upload Result")
+         file.change(fn=upload_pdf, inputs=file, outputs=upload_out)
+
+     with gr.Tab("2. Generate Q&A"):
+         fname = gr.Textbox(label="Enter uploaded filename")
+         qa_result = gr.Textbox(label="Q&A Output", lines=10)
+         gr.Button("Generate Q&A").click(fn=generate_qa, inputs=fname, outputs=qa_result)
+
+     with gr.Tab("3. Ask a Question"):
+         token_box = gr.Textbox(label="Enter Token ID")
+         question_box = gr.Textbox(label="Your Question")
+         answer_result = gr.Textbox(label="Answer", lines=5)
+         gr.Button("Ask").click(fn=ask_question, inputs=[token_box, question_box], outputs=answer_result)
+
+ demo.launch()
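
As a rough smoke test of the upload/ask flow without the UI, something like the snippet below could be run in the same interpreter session (sample.pdf is a placeholder; note that app.py calls demo.launch() at module level, so guarding that call with if __name__ == "__main__": would make these functions importable from elsewhere):

with open("sample.pdf", "rb") as f:      # an open file has a .name attribute, like the gr.File value
    status = upload_pdf(f)
print(status)

# upload_pdf only surfaces the token inside its status string, e.g. "... (Token: <uuid>)"
token = status.rsplit("Token: ", 1)[-1].rstrip(")")
print(ask_question(token, "What is this document about?"))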
database1.py ADDED
@@ -0,0 +1,46 @@
+ import sqlite3
+ import json
+
+ class create_db:
+     def __init__(self, token, chunk_json1, filename, full_content):
+         conn = sqlite3.connect('my_database.db')
+         cursor = conn.cursor()
+
+         # Only store into this table
+         cursor.execute("""
+             CREATE TABLE IF NOT EXISTS token_data (
+                 token_id TEXT PRIMARY KEY,
+                 chunk_data TEXT,
+                 filename TEXT,
+                 full_content TEXT
+             )
+         """)
+
+         chunk_json = json.dumps(chunk_json1)
+
+         try:
+             cursor.execute(
+                 "INSERT INTO token_data (token_id, chunk_data, filename, full_content) VALUES (?, ?, ?, ?)",
+                 (token, chunk_json, filename, full_content)
+             )
+             conn.commit()
+             print({"message": "PDF uploaded and stored successfully"})
+         except sqlite3.IntegrityError:
+             print({"error": "Token already exists."})
+
+         conn.close()
+
+     @staticmethod
+     def get_all_filenames():
+         conn = sqlite3.connect('my_database.db')
+         cursor = conn.cursor()
+         cursor.execute("SELECT filename FROM token_data")
+         rows = cursor.fetchall()
+         conn.close()
+
+         if rows:
+             return {
+                 "pdfs": [{"filename": row[0]} for row in rows]
+             }
+         else:
+             return {"pdfs": []}
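
create_db is a class, but it is instantiated purely for its side effects (creating the table and inserting one row). A small usage sketch with placeholder data:

import uuid
from database1 import create_db

token = str(uuid.uuid4())
chunks = ["first 500-character chunk ...", "second chunk ..."]   # placeholder chunks
create_db(token, chunks, "example.pdf", " ".join(chunks))        # insert happens in __init__

print(create_db.get_all_filenames())   # e.g. {'pdfs': [{'filename': 'example.pdf'}]}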
first1.py ADDED
@@ -0,0 +1,57 @@
+ import PyPDF2
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+ import re
+
+
+ class pdf_query:
+     def __init__(self):
+         self.model = SentenceTransformer("all-MiniLM-L6-v2")
+         self.read = None
+
+     def file(self, file):
+         self.read = PyPDF2.PdfReader(file)
+
+     def extract_text(self):
+         text = ""
+         for page in self.read.pages:
+             content = page.extract_text()
+             if content:
+                 text += content + "\n"
+         return text.strip()
+
+     def split_into_chunks(self, text, chunk_size=300):
+         # Split using punctuation for better sentence boundaries
+         sentences = re.split(r'(?<=[.!?])\s+', text)
+         chunks = []
+         current_chunk = ""
+
+         for sentence in sentences:
+             if len(current_chunk) + len(sentence) <= chunk_size:
+                 current_chunk += sentence + " "
+             else:
+                 chunks.append(current_chunk.strip())
+                 current_chunk = sentence + " "
+         if current_chunk:
+             chunks.append(current_chunk.strip())
+
+         return chunks
+
+     def creat_model(self, chunks):
+         model = SentenceTransformer("all-MiniLM-L6-v2")
+         chunk_embeddings = model.encode(chunks)
+         return model, chunk_embeddings
+
+     def answer_question(self, question, chunks, chunk_embeddings, model, threshold=0.6):
+         q_embedding = model.encode([question])  # same model as above
+         scores = cosine_similarity(q_embedding, chunk_embeddings)
+         best_score = np.max(scores)
+         best_chunk_index = np.argmax(scores)
+         if best_score >= threshold:
+             best_chunk = chunks[best_chunk_index]
+             # Clean the answer
+             cleaned_answer = re.sub(r'\s+', ' ', best_chunk.strip())
+             return cleaned_answer
+         else:
+             return {"answer": "Answer not found in PDF"}
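
A short sketch of the intended end-to-end use of pdf_query (sample.pdf is a placeholder; note that the method name is spelled creat_model, and that answer_question returns a plain string on a match but a dict when nothing clears the threshold):

from first1 import pdf_query

pq = pdf_query()
with open("sample.pdf", "rb") as f:
    pq.file(f)
    text = pq.extract_text()   # read pages while the file is still open

chunks = pq.split_into_chunks(text)
model, chunk_embeddings = pq.creat_model(chunks)
print(pq.answer_question("What is this document about?", chunks, chunk_embeddings, model))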
q_generator1.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+
+ class QGenerator:
+     def __init__(self):
+         tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-small-qg-hl", use_fast=False)
+         model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qg-hl")
+         self.qg = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+
+     def split_sentences(self, text):
+         # Simple sentence splitting (for better results, use nltk or spacy)
+         return [s.strip() for s in text.split('.') if s.strip()]
+
+     def chunk_text(self, text, chunk_size=512):
+         return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
+
+     def generate(self, text, max_questions=5):
+         questions = []
+         sentences = self.split_sentences(text)
+
+         for sentence in sentences:
+             if len(questions) >= max_questions:
+                 break
+
+             input_text = f"generate question: {sentence} </s>"
+             try:
+                 result = self.qg(input_text, max_length=64, num_return_sequences=1)[0]
+                 question = result["generated_text"]
+                 if question and question not in questions:
+                     questions.append(question)
+             except Exception as e:
+                 print("Error generating question:", e)
+                 continue
+
+         return questions
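
A quick sketch of generating questions from a passage (the text is a placeholder; note that the -qg-hl variant of this model was trained with <hl> answer highlights, so plain sentences may yield weaker questions):

from q_generator1 import QGenerator

qgen = QGenerator()
text = ("The Eiffel Tower is a wrought-iron lattice tower in Paris. "
        "It was completed in 1889 as the entrance arch to the World's Fair.")
for q in qgen.generate(text, max_questions=3):
    print(q)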
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ transformers
+ torch
+ sentence-transformers
+ PyPDF2
+ scikit-learn
+ numpy
+ uuid
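
The dependencies install with pip install -r requirements.txt. One caveat: uuid is part of the Python standard library, so that entry is redundant; the uuid package on PyPI is an old Python 2 backport and can usually be dropped.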