# Initial PDF evaluator app — NNEngine (commit 0712e1f)
import gradio as gr
import torch
from pypdf import PdfReader
from PIL import Image
import io
from transformers import (
TrOCRProcessor,
VisionEncoderDecoderModel,
AutoTokenizer,
AutoModelForCausalLM
)
# ============================================================
# Device
# ============================================================
# Prefer GPU when available; all models below are placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ============================================================
# Load Models (cached by HF Spaces)
# ============================================================
# TrOCR (printed-text variant) converts embedded page images into text.
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-printed"
).to(device)
# Qwen2.5-1.5B-Instruct is the grader LLM used by evaluate_text().
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    trust_remote_code=True
)
qwen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    device_map="auto",
    # fp16 halves GPU memory; fp32 on CPU where half precision is slow/unsupported.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    trust_remote_code=True
)
# ============================================================
# Helpers
# ============================================================
def is_scanned_pdf(reader):
    """Return True when no page of *reader* yields extractable text.

    A PDF whose pages all lack a text layer is treated as a scan
    that needs OCR.
    """
    return not any(page.extract_text() for page in reader.pages)
def extract_text_from_pdf(file):
    """Extract text from a PDF, falling back to OCR of embedded images.

    Parameters
    ----------
    file : path or binary file-like object accepted by ``pypdf.PdfReader``.

    Returns
    -------
    str
        Extracted page/image texts joined by blank lines; empty string
        when nothing could be extracted.
    """
    reader = PdfReader(file)
    extracted_text = []
    if not is_scanned_pdf(reader):
        # Digital PDF: read the embedded text layer directly.
        for page in reader.pages:
            text = page.extract_text()
            if text:
                extracted_text.append(text)
    else:
        # Scanned PDF: OCR only embedded images (HF-safe, no page rasterizer).
        for page in reader.pages:
            # Pages may have no /Resources dict at all — page["/Resources"]
            # would raise KeyError, so look it up defensively.
            resources = page.get("/Resources")
            if not resources or "/XObject" not in resources:
                continue
            xobjects = resources["/XObject"].get_object()
            for name in xobjects:
                xobj = xobjects[name]
                # Some XObjects (e.g. forms) have no or another /Subtype.
                if xobj.get("/Subtype") != "/Image":
                    continue
                try:
                    image = Image.open(io.BytesIO(xobj.get_data())).convert("RGB")
                except Exception:
                    # Unsupported image filter/encoding — skip this image
                    # instead of failing the whole document.
                    continue
                pixel_values = ocr_processor(
                    images=image,
                    return_tensors="pt"
                ).pixel_values.to(device)
                with torch.no_grad():
                    ids = ocr_model.generate(pixel_values)
                text = ocr_processor.batch_decode(
                    ids,
                    skip_special_tokens=True
                )[0]
                extracted_text.append(text)
    return "\n\n".join(extracted_text)
def evaluate_text(text):
    """Score *text* out of 10 with the Qwen instruct model.

    The document is truncated to its first 6000 characters to stay
    within the model's practical context budget.

    Returns the model's evaluation (Score / Justification / Strengths /
    Weaknesses) as a string, without echoing the prompt back.
    """
    prompt = f"""
You are a strict academic evaluator.
Evaluate the following document and assign marks out of 10.
Criteria:
- Clarity
- Structure
- Technical depth
- Language quality
- Completeness
DOCUMENT:
---------
{text[:6000]}
---------
Respond strictly in this format:
Score: X/10
Justification:
Strengths:
Weaknesses:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = qwen_model.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=False
        )
    # BUG FIX: a causal LM's generate() output contains the prompt tokens
    # followed by the new ones; decoding output[0] verbatim echoed the whole
    # prompt into the UI. Decode only the newly generated tail.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
# ============================================================
# Gradio Function
# ============================================================
def process_pdf(pdf_file):
    """Gradio handler: extract text from the uploaded PDF and evaluate it.

    Parameters
    ----------
    pdf_file : uploaded file from ``gr.File`` (None when nothing uploaded).

    Returns
    -------
    tuple[str, str]
        (extracted text, evaluation) for the two output textboxes.
    """
    # Clicking the button with no upload passes None — give a hint
    # instead of crashing inside PdfReader.
    if pdf_file is None:
        return "", "Please upload a PDF file first."
    extracted_text = extract_text_from_pdf(pdf_file)
    # Nothing extractable: skip the pointless LLM call on an empty document.
    if not extracted_text.strip():
        return extracted_text, "No text could be extracted from this PDF."
    evaluation = evaluate_text(extracted_text)
    return extracted_text, evaluation
# ============================================================
# Gradio UI
# ============================================================
with gr.Blocks(title="PDF Evaluator (OCR + Qwen)") as demo:
    # Intro banner.
    gr.Markdown("""
# 📄 PDF Evaluator
Upload a PDF to:
- Extract text (OCR if needed)
- Evaluate content using Qwen
- Get marks out of 10
""")

    # Input controls.
    uploaded_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
    run_button = gr.Button("Extract & Evaluate")

    # Output panels: raw extracted text and the model's evaluation.
    text_box = gr.Textbox(
        label="Extracted Text",
        lines=20
    )
    eval_box = gr.Textbox(
        label="Evaluation",
        lines=10
    )

    # Wire the button to the end-to-end pipeline.
    run_button.click(
        process_pdf,
        inputs=uploaded_pdf,
        outputs=[text_box, eval_box]
    )
demo.launch()