Spaces:

prat1003
/

project2

Sleeping

App Files Community

prat1003 committed on Oct 13, 2025

Commit

2085bbf

verified ·

1 Parent(s): e7b5f58

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -41

app.py CHANGED Viewed

@@ -1,20 +1,43 @@
 import gradio as gr
-from pdf2image import convert_from_path
-import pytesseract
-from transformers import pipeline
-import json
 import tempfile
 import shutil
 import os
-import easyocr
 import numpy as np
 reader = easyocr.Reader(['en'])
 def extract_text_from_scanned_pdf(file_path):
-    pages = convert_from_path(file_path)
     text = ""
     for page in pages:
         img_array = np.array(page)
@@ -22,43 +45,37 @@ def extract_text_from_scanned_pdf(file_path):
         text += " ".join(result) + "\n"
     return text.strip()
-# 🧠 Load lightweight question generation model
-qg_pipeline = pipeline(
-    "text2text-generation",
-    model="valhalla/t5-small-qg-prepend",
-    tokenizer="t5-small"
-)
-# 🧩 OCR function: extract text from scanned PDFs
-#def extract_text_from_scanned_pdf(file_path):
-#   pages = convert_from_path(file_path)
-#    text = ""
-#    for page in pages:
-#        text += pytesseract.image_to_string(page)
-#   return text.strip()
-# ⚙️ Main processing function
 def process_pdf(pdf_file):
-    # Step 1️⃣: Copy uploaded file to a temporary location
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
         shutil.copy(pdf_file.name, temp_pdf.name)
         temp_pdf_path = temp_pdf.name
-    # Step 2️⃣: Extract text using OCR
-    extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
-    os.remove(temp_pdf_path)
     if not extracted_text.strip():
-        return "❌ Could not extract text. Make sure the PDF has readable text."
-    # Step 3️⃣: Generate questions from extracted text
-    prompt = "generate questions: " + extracted_text[:1000]  # limit to 1000 chars
-    questions_output = qg_pipeline(prompt, max_length=128, num_return_sequences=3)
-    # Step 4️⃣: Convert model output into question list
     question_list = []
     for q in questions_output:
         question_list.append({
@@ -71,7 +88,7 @@ def process_pdf(pdf_file):
             ]
         })
-    # Step 5️⃣: Build the <questiondata> structure
     data = {
         "title": "Certification Title",
         "totalmarks": "50",
@@ -84,17 +101,19 @@ def process_pdf(pdf_file):
         "maxattempts": 3
     }
-    # Step 6️⃣: Wrap JSON inside XML CDATA
     xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
     return xml_output
-# 🚀 Gradio Web UI
 iface = gr.Interface(
     fn=process_pdf,
-    inputs=gr.File(label="📄 Upload your scanned PDF"),
     outputs="text",
     title="PDF to Question Generator (with OCR)",
-    description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration."
 )
 iface.launch()

 import gradio as gr
 import tempfile
 import shutil
 import os
+import json
 import numpy as np
+from pdf2image import convert_from_path
+import easyocr
+from PyPDF2 import PdfReader
+from transformers import pipeline
+# -----------------------------
+# Initialize OCR and Transformers
+# -----------------------------
 reader = easyocr.Reader(['en'])
+qg_pipeline = pipeline(
+    "text2text-generation",
+    model="valhalla/t5-small-qg-prepend",
+    tokenizer="t5-small"
+)
+# -----------------------------
+# Extract text from selectable PDFs
+# -----------------------------
+def extract_text_from_pdf(file_path):
+    reader_pdf = PdfReader(file_path)
+    text = ""
+    for page in reader_pdf.pages:
+        t = page.extract_text()
+        if t:
+            text += t + "\n"
+    return text.strip()
+# -----------------------------
+# Extract text from scanned PDFs using EasyOCR
+# -----------------------------
 def extract_text_from_scanned_pdf(file_path):
+    # Reduce DPI for faster processing
+    pages = convert_from_path(file_path, dpi=150)
     text = ""
     for page in pages:
         img_array = np.array(page)
         text += " ".join(result) + "\n"
     return text.strip()
+# -----------------------------
+# Main processing function
+# -----------------------------
 def process_pdf(pdf_file):
+    # Save uploaded PDF to temp file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
         shutil.copy(pdf_file.name, temp_pdf.name)
         temp_pdf_path = temp_pdf.name
+    # Step 1: Try extracting text from PDF directly
+    extracted_text = extract_text_from_pdf(temp_pdf_path)
+    # Step 2: If empty, use OCR
     if not extracted_text.strip():
+        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
+    os.remove(temp_pdf_path)
+    if not extracted_text.strip():
+        return "❌ Could not extract text. Make sure the PDF has readable content."
+    # Step 3: Generate questions with beam search (3 questions)
+    prompt = "generate questions: " + extracted_text[:1000]  # limit to first 1000 chars
+    questions_output = qg_pipeline(
+        prompt,
+        max_length=128,
+        num_beams=3,            # beam search
+        num_return_sequences=3
+    )
+    # Step 4: Build question list
     question_list = []
     for q in questions_output:
         question_list.append({
             ]
         })
+    # Step 5: Build <questiondata> structure
     data = {
         "title": "Certification Title",
         "totalmarks": "50",
         "maxattempts": 3
     }
+    # Step 6: Wrap JSON in XML CDATA
     xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
     return xml_output
+# -----------------------------
+# Gradio Interface
+# -----------------------------
 iface = gr.Interface(
     fn=process_pdf,
+    inputs=gr.File(label="📄 Upload your PDF"),
     outputs="text",
     title="PDF to Question Generator (with OCR)",
+    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates <questiondata> XML for quizzes."
 )
 iface.launch()