deepkansara-123 commited on
Commit
cd7638d
·
verified ·
1 Parent(s): 4ad01b5

Upload 6 files

Browse files
Files changed (6) hide show
  1. ans_generator1.py +44 -0
  2. app.py +195 -0
  3. database1.py +39 -0
  4. first1.py +57 -0
  5. q_generator1.py +34 -0
  6. requirements.txt +13 -0
ans_generator1.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import PyPDF2
3
+ import os
4
+
5
+ UPLOAD_DIR = "uploaded_pdfs"
6
+
7
class AnswerGenerator:
    """Answers questions about a previously uploaded PDF with a local FLAN-T5 model.

    The PDF is located on disk as ``{UPLOAD_DIR}/{token}.pdf``, where ``token``
    is the upload token handed out when the file was stored.
    """

    def __init__(self):
        # Bug fix: FLAN-T5 is a sequence-to-sequence (text generation) model.
        # The original code loaded it through the extractive
        # "question-answering" pipeline, which expects a span-prediction
        # (BERT-style) QA head and does not work properly with seq2seq
        # checkpoints. Use "text2text-generation" and build an explicit
        # QA prompt instead.
        self.qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

    def extract_pdf_text(self, token):
        """Return a list with the extracted text of every page of the PDF.

        Raises:
            FileNotFoundError: when no uploaded PDF matches ``token``.
        """
        pdf_path = os.path.join(UPLOAD_DIR, f"{token}.pdf")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError("PDF not found for given token")

        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # extract_text() may return None for image-only pages; normalize to "".
            return [page.extract_text() or "" for page in reader.pages]

    def generate_answers(self, token, questions):
        """Answer each question in ``questions`` against the full PDF text.

        Returns a list of dicts, one per question. Successful answers look
        like ``{"question", "answer"}``; per-question failures are reported
        as ``{"question", "answer": "Error", "error"}`` instead of aborting
        the whole batch.
        """
        full_text = "\n".join(self.extract_pdf_text(token))  # merge pages as context
        results = []

        for question in questions:
            try:
                # Prompt the seq2seq model explicitly with context + question.
                prompt = f"Context: {full_text}\n\nQuestion: {question}\n\nAnswer:"
                output = self.qa_pipeline(prompt, max_length=256, do_sample=False)
                results.append({"question": question, "answer": output[0]["generated_text"].strip()})
            except Exception as e:
                results.append({"question": question, "answer": "Error", "error": str(e)})

        return results
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import uuid
3
+ import sqlite3
4
+ import json
5
+ import re
6
+ import PyPDF2
7
+ import numpy as np
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+
11
+ # Local imports
12
+ from database1 import create_db
13
+ from first1 import pdf_query
14
+
15
+ from ans_generator1 import AnswerGenerator
16
+
17
+ import sqlite3, json
18
+ from q_generator1 import QGenerator
19
+ from transformers import pipeline
20
# Initialize models
# Both models are loaded once at import time (each download/load is slow).
qgen = QGenerator()          # question generator (valhalla/t5-small-qg-hl)
ansgen = AnswerGenerator()   # PDF answer generator (FLAN-T5 helper class)

# Load FLAN-T5 model
# use_fast=False forces the slow (sentencepiece-based) tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# text2text-generation pipeline used for prompted question answering below.
qa_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
28
+
29
+
30
# ✅ Upload and process PDF
# ✅ Updated version – supports multiple PDF files
def upload_pdf(files):
    """Store each uploaded PDF in the SQLite database under a fresh token.

    For every file: extract the text of all pages, split it into fixed-size
    500-character chunks, and persist token/chunks/filename/full text via
    ``create_db``. Returns a human-readable status string, one line per file,
    or a single error line if anything fails.
    """
    try:
        messages = []

        for file in files:
            filename = file.name
            token = str(uuid.uuid4())

            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text() may return None for image-only pages.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            # Fixed-width chunking; the last chunk may be shorter than 500.
            chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

            create_db(token, chunks, filename, text)
            # Bug fix: the status line previously printed a literal
            # "(unknown)" placeholder; report the actual filename.
            messages.append(f"✅ Uploaded and stored: {filename} (Token: {token})")

        return "\n".join(messages)

    except Exception as e:
        return f"❌ Error: {str(e)}"
51
+
52
+
53
+
54
# Load QG and QA once
# NOTE(review): these two assignments rebind the module-level `qgen` and
# `qa_model` that were already created near the top of this file with
# equivalent objects, so both models are actually loaded twice at import
# time — consider removing one of the two initialisations.
qgen = QGenerator()
qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
57
def generate_qa(token):
    """Generate question/answer pairs from the PDF stored under ``token``.

    Looks up the JSON-encoded chunk list in the ``token_data`` table, asks
    the question-generator (``qgen``) for questions per chunk, then answers
    at most 2 questions per chunk with the FLAN-T5 ``qa_model``. Returns a
    single string of "Q: ...\\nA: ..." blocks, or a user-facing status/error
    message. All intermediate steps are printed for server-side debugging.

    NOTE(review): the UI wires this up with a textbox labelled "filename",
    but the lookup below is by ``token_id`` — callers must pass the upload
    token, not the PDF's filename.
    """
    try:
        if not token:
            return "⚠️ Please provide a token."

        print("📥 Received Token:", token)

        # Load chunk_data using token
        with sqlite3.connect("my_database.db") as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT chunk_data FROM token_data WHERE token_id = ?", (token,))
            row = cursor.fetchone()

        if not row:
            print("❌ No data found for token in DB.")
            return "❌ No data found for this token."

        # chunk_data is stored as a JSON array of text chunks (see database1.py).
        chunks = json.loads(row[0])
        if not chunks:
            print("⚠️ Chunk data is empty.")
            return "⚠️ No content available in database for this PDF."

        qa_pairs = []

        for i, chunk in enumerate(chunks):
            print(f"\n🔹 Processing chunk {i+1}/{len(chunks)}")
            questions = qgen.generate(chunk)
            print(f"🧠 Questions generated: {questions}")

            if not questions:
                print("⚠️ No questions generated for this chunk.")
                continue

            for question in questions[:2]:  # Max 2 Qs per chunk
                # The seq2seq model answers from an explicit context+question prompt.
                prompt = f"Context: {chunk}\n\nQuestion: {question}\n\nAnswer:"
                print(f"➡️ Prompt:\n{prompt}")

                try:
                    result = qa_model(prompt, max_length=256, do_sample=False)
                    print(f"⬅️ Raw model output: {result}")

                    # text2text-generation returns [{"generated_text": ...}];
                    # the dict/"answer" branch is a defensive fallback for
                    # other pipeline result shapes.
                    if isinstance(result, list) and "generated_text" in result[0]:
                        answer = result[0]["generated_text"].strip()
                    elif isinstance(result, dict) and "answer" in result:
                        answer = result["answer"].strip()
                    else:
                        answer = "N/A"

                    print(f"✅ Final Answer: {answer}")
                    qa_pairs.append(f"Q: {question}\nA: {answer}")

                except Exception as e:
                    # A single failed question should not abort the whole run.
                    print(f"❌ QA model failed: {e}")
                    continue

        if not qa_pairs:
            print("⚠️ No Q&A pairs generated.")
            return "⚠️ No Q&A pairs generated."

        print("✅ Final Q&A generated successfully.")
        return "\n\n".join(qa_pairs)

    except Exception as e:
        print(f"🔥 Exception in generate_qa(): {e}")
        return f"❌ Error: {str(e)}"
122
+
123
+
124
# ✅ Ask question using token (semantic similarity)
def ask_question(token, question):
    """Answer ``question`` with the stored chunk most similar to it.

    Retrieves the chunk list stored under ``token``, embeds the question and
    all non-empty chunks with the sentence-transformer from ``pdf_query``,
    and returns the highest-cosine-similarity chunk together with its score.
    Returns a user-facing error string on any failure.
    """
    try:
        with sqlite3.connect("my_database.db") as conn:
            record = conn.execute(
                "SELECT chunk_data FROM token_data WHERE token_id = ?", (token,)
            ).fetchone()

        if not record:
            return "❌ Token not found."

        stored_chunks = json.loads(record[0])
        encoder = pdf_query().model

        # Drop empty chunks and collapse internal whitespace for display.
        candidates = [re.sub(r'\s+', ' ', chunk.strip()) for chunk in stored_chunks if chunk.strip()]
        if not candidates:
            return "⚠️ No valid content found in PDF."

        # One similarity score per candidate chunk (single query row).
        similarity_row = cosine_similarity(encoder.encode([question]), encoder.encode(candidates))[0]

        best_idx = int(np.argmax(similarity_row))
        best_score = float(similarity_row[best_idx])
        best_text = candidates[best_idx]

        return f"Q: {question}\nA: {best_text}\nScore: {round(best_score, 3)}"

    except Exception as e:
        return f"❌ Error: {str(e)}"
155
+
156
+
157
+
158
+
159
+
160
+
161
# ✅ Gradio UI
# Bug fix: the original code built TWO separate gr.Blocks() apps and bound
# both to `demo`; the second assignment shadowed the first, so the Upload
# tab was never served. All three tabs now live in a single Blocks app.
# The Q&A tab also asked for a *filename* while generate_qa() looks records
# up by token_id, so its input is now labelled as a token.
with gr.Blocks(theme="default", title="PDF Q&A Generator") as demo:
    gr.Markdown(
        """
        <div style='text-align: center; padding: 1rem;'>
            <h1 style='color: #3b82f6;'>📄 AI-Powered PDF Q&A System</h1>
            <p style='font-size: 1.1rem;'>Upload your PDFs, generate smart questions, and get intelligent answers.</p>
        </div>
        """
    )

    with gr.Tab("📤 1. Upload PDF"):
        gr.Markdown("### 🗂 Upload a PDF File")
        file = gr.File(label="Upload one or more PDFs", file_types=[".pdf"], file_count="multiple")
        upload_out = gr.Textbox(label="Upload Result", interactive=False)
        # Uploading triggers processing immediately; the result shows the tokens.
        file.change(fn=upload_pdf, inputs=file, outputs=upload_out)

    with gr.Tab("🧠 2. Generate Questions & Answers"):
        gr.Markdown("### 🤖 Generate Questions and Answers from Uploaded PDF")
        qa_token = gr.Textbox(label="🔑 Enter Upload Token", placeholder="e.g., 123e4567-e89b-12d3-a456...")
        output_box = gr.Textbox(label="📝 Generated Q&A", lines=15, interactive=False)
        gr.Button("🚀 Generate Q&A").click(fn=generate_qa, inputs=qa_token, outputs=output_box)

    with gr.Tab("❓ 3. Ask a Question"):
        gr.Markdown("### 💬 Ask a question based on uploaded PDF")
        token_box = gr.Textbox(label="Token ID", placeholder="e.g., 123e4567-e89b-12d3-a456...")
        question_box = gr.Textbox(label="Type your question", placeholder="What is the main topic discussed?")
        answer_result = gr.Textbox(label="Answer Output", lines=6, interactive=False)
        gr.Button("🎯 Get Answer").click(fn=ask_question, inputs=[token_box, question_box], outputs=answer_result)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0", server_port=7860)
195
+
database1.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+
4
class create_db:
    """Persists one uploaded PDF into the local SQLite database.

    Instantiating the class performs the insert (app.py uses it like a
    function): it creates the ``token_data`` table on first use and stores
    the chunk list (JSON-encoded), the original filename and the full
    extracted text under ``token``. Duplicate tokens are reported, not raised.
    """

    def __init__(self, token, chunk_json1, filename, full_content):
        conn = sqlite3.connect('my_database.db')
        try:
            cursor = conn.cursor()

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS token_data (
                    token_id TEXT PRIMARY KEY,
                    chunk_data TEXT,
                    filename TEXT,
                    full_content TEXT
                )
            """)

            # Chunks are stored as a JSON array in a single TEXT column.
            chunk_json = json.dumps(chunk_json1)

            try:
                cursor.execute(
                    "INSERT INTO token_data (token_id, chunk_data, filename, full_content) VALUES (?, ?, ?, ?)",
                    (token, chunk_json, filename, full_content)
                )
                conn.commit()
                # Bug fix: these messages previously printed a literal
                # "(unknown)" placeholder instead of the actual filename.
                print({"message": f"✅ {filename} uploaded and stored successfully"})
            except sqlite3.IntegrityError:
                # token_id is the PRIMARY KEY, so a duplicate token is rejected.
                print({"error": f"❌ Token already exists for: {filename}"})
        finally:
            # Bug fix: the connection leaked if table creation or JSON
            # encoding raised; always close it.
            conn.close()

    @staticmethod
    def get_all_filenames():
        """Return ``{"pdfs": [{"filename": ...}, ...]}`` for every stored PDF."""
        conn = sqlite3.connect('my_database.db')
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT filename FROM token_data")
            rows = cursor.fetchall()
        finally:
            conn.close()
        return {"pdfs": [{"filename": row[0]} for row in rows]}
first1.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from sentence_transformers import SentenceTransformer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+ import re
6
+
7
+
8
class pdf_query:
    """Semantic-search helper over a PDF's text.

    Wraps PDF text extraction, sentence-boundary chunking, sentence-transformer
    embedding, and cosine-similarity lookup of the chunk that best matches a
    question.
    """

    def __init__(self):
        # Sentence-embedding model used for similarity scoring.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # PyPDF2 reader; populated by file().
        self.read = None

    def file(self, file):
        """Attach a PDF file handle for later text extraction."""
        self.read = PyPDF2.PdfReader(file)

    def extract_text(self):
        """Concatenate the text of every page, one page per line."""
        page_texts = (page.extract_text() for page in self.read.pages)
        # Pages with no extractable text (e.g. scanned images) are skipped.
        return "\n".join(content for content in page_texts if content).strip()

    def split_into_chunks(self, text, chunk_size=300):
        """Group sentences into chunks of at most ~chunk_size characters.

        Sentences are found by splitting after ., ! or ? so chunk boundaries
        fall on sentence boundaries where possible.
        """
        pieces = re.split(r'(?<=[.!?])\s+', text)
        result = []
        buf = ""
        idx = 0
        while idx < len(pieces):
            piece = pieces[idx]
            if len(buf) + len(piece) > chunk_size:
                # Current buffer is full: flush it and start a new chunk.
                result.append(buf.strip())
                buf = piece + " "
            else:
                buf += piece + " "
            idx += 1
        if buf:
            result.append(buf.strip())

        return result

    def creat_model(self, chunks):
        """Build a fresh encoder and embed `chunks`.

        (Method name kept as-is for backward compatibility with callers.)
        """
        encoder = SentenceTransformer("all-MiniLM-L6-v2")
        return encoder, encoder.encode(chunks)

    def answer_question(self, question, chunks, chunk_embeddings, model, threshold=0.6):
        """Return the cleaned best-matching chunk, or a not-found dict.

        The question is embedded with the same model that produced
        `chunk_embeddings`; chunks scoring below `threshold` are rejected.
        """
        scores = cosine_similarity(model.encode([question]), chunk_embeddings)
        if np.max(scores) < threshold:
            return {"answer": "Answer not found in PDF"}
        # Collapse internal whitespace in the winning chunk before returning.
        winner = chunks[np.argmax(scores)]
        return re.sub(r'\s+', ' ', winner.strip())
q_generator1.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, pipeline
2
+
3
class QGenerator:
    """Generates questions from raw text with the valhalla/t5-small-qg-hl model."""

    def __init__(self):
        # use_fast=False selects the slow sentencepiece tokenizer.
        tok = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-hl", use_fast=False)
        mdl = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qg-hl")
        self.qg = pipeline("text2text-generation", model=mdl, tokenizer=tok)

    def split_sentences(self, text):
        """Naively split on '.'; empty fragments are dropped.

        (For better results, use nltk or spacy.)
        """
        sentences = []
        for fragment in text.split('.'):
            fragment = fragment.strip()
            if fragment:
                sentences.append(fragment)
        return sentences

    def chunk_text(self, text, chunk_size=512):
        """Cut `text` into consecutive chunks of at most `chunk_size` chars."""
        chunks = []
        start = 0
        while start < len(text):
            chunks.append(text[start:start + chunk_size])
            start += chunk_size
        return chunks

    def generate(self, text, max_questions=5):
        """Generate up to `max_questions` unique questions from `text`.

        Each sentence is fed to the QG model separately; per-sentence model
        failures are logged and skipped rather than aborting the batch.
        """
        collected = []

        for sentence in self.split_sentences(text):
            if len(collected) >= max_questions:
                break

            prompt = f"generate question: {sentence} </s>"
            try:
                output = self.qg(prompt, max_length=64, num_return_sequences=1)[0]
                candidate = output["generated_text"]
                # De-duplicate and ignore empty generations.
                if candidate and candidate not in collected:
                    collected.append(candidate)
            except Exception as err:
                print("Error generating question:", err)
                continue

        return collected
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ sentence-transformers
5
+ PyPDF2
6
+ scikit-learn
7
+ numpy
8
+ # uuid — removed from installable requirements: it is part of the Python standard library (the PyPI "uuid" package is an obsolete Python 2 shim)
9
+ # sentence_transformers — removed: duplicate of the sentence-transformers entry above (pip normalizes _ and - to the same package)
10
+ sentencepiece
11
+ tiktoken
12
+
13
+