"""PDF Evaluator for Hugging Face Spaces.

Extracts text from an uploaded PDF (directly for digital PDFs, TrOCR OCR on
embedded images for scanned PDFs) and asks Qwen2.5-1.5B-Instruct to score the
document out of 10.
"""

import io

import gradio as gr
import torch
from PIL import Image
from pypdf import PdfReader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrOCRProcessor,
    VisionEncoderDecoderModel,
)

# ============================================================
# Device
# ============================================================
device = "cuda" if torch.cuda.is_available() else "cpu"

# ============================================================
# Load Models (cached by HF Spaces)
# ============================================================
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-printed"
).to(device)

tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    trust_remote_code=True,
)
qwen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    device_map="auto",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    trust_remote_code=True,
)

# ============================================================
# Helpers
# ============================================================


def is_scanned_pdf(reader):
    """Return True when no page of *reader* yields any extractable text."""
    return not any(page.extract_text() for page in reader.pages)


def _ocr_image(image):
    """Run TrOCR on a PIL RGB image and return the recognized text."""
    pixel_values = ocr_processor(
        images=image, return_tensors="pt"
    ).pixel_values.to(device)
    with torch.no_grad():
        ids = ocr_model.generate(pixel_values)
    return ocr_processor.batch_decode(ids, skip_special_tokens=True)[0]


def extract_text_from_pdf(file):
    """Extract text from a PDF file/path.

    Digital PDFs are read with pypdf's text extraction; scanned PDFs fall
    back to OCR over each embedded page image.

    Returns the concatenated text (pages/images separated by blank lines).
    """
    reader = PdfReader(file)
    extracted_text = []

    if not is_scanned_pdf(reader):
        # Digital PDF: text layer is present.
        for page in reader.pages:
            text = page.extract_text()
            if text:
                extracted_text.append(text)
    else:
        # Scanned PDF: OCR the embedded images. `page.images` decodes the
        # image streams (DCTDecode, FlateDecode, ...) into usable bytes —
        # unlike reading raw XObject stream data, which PIL often cannot
        # open for non-JPEG filters.
        for page in reader.pages:
            for img in page.images:
                try:
                    image = Image.open(io.BytesIO(img.data)).convert("RGB")
                except Exception:
                    # Skip undecodable streams (soft masks, exotic filters)
                    # instead of failing the whole extraction.
                    continue
                extracted_text.append(_ocr_image(image))

    return "\n\n".join(extracted_text)


def evaluate_text(text):
    """Score *text* out of 10 with Qwen and return only the model's reply.

    The document is truncated to its first 6000 characters to keep the
    prompt within the model's practical context budget.
    """
    if not text.strip():
        return "Score: 0/10\nJustification: No text could be extracted from the document."

    prompt = f"""
You are a strict academic evaluator.
Evaluate the following document and assign marks out of 10.

Criteria:
- Clarity
- Structure
- Technical depth
- Language quality
- Completeness

DOCUMENT:
---------
{text[:6000]}
---------

Respond strictly in this format:
Score: X/10
Justification:
Strengths:
Weaknesses:
"""

    # Instruct models expect their chat template, not a raw prompt.
    chat = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # Use the model's own device: with device_map="auto" it may differ
    # from the module-level `device`.
    inputs = tokenizer(chat, return_tensors="pt").to(qwen_model.device)

    with torch.no_grad():
        output = qwen_model.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=False,
        )

    # Decode only the newly generated tokens so the prompt is not echoed
    # back into the evaluation box.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# ============================================================
# Gradio Function
# ============================================================


def process_pdf(pdf_file):
    """Gradio handler: returns (extracted_text, evaluation) for an upload."""
    if pdf_file is None:
        return "", "Please upload a PDF file first."
    extracted_text = extract_text_from_pdf(pdf_file)
    evaluation = evaluate_text(extracted_text)
    return extracted_text, evaluation


# ============================================================
# Gradio UI
# ============================================================

with gr.Blocks(title="PDF Evaluator (OCR + Qwen)") as demo:
    gr.Markdown("""
# 📄 PDF Evaluator
Upload a PDF to:
- Extract text (OCR if needed)
- Evaluate content using Qwen
- Get marks out of 10
""")

    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    extract_btn = gr.Button("Extract & Evaluate")

    extracted_output = gr.Textbox(
        label="Extracted Text",
        lines=20,
    )
    evaluation_output = gr.Textbox(
        label="Evaluation",
        lines=10,
    )

    extract_btn.click(
        process_pdf,
        inputs=pdf_input,
        outputs=[extracted_output, evaluation_output],
    )

demo.launch()