Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| from pypdf import PdfReader | |
| from PIL import Image | |
| import io | |
| from transformers import ( | |
| TrOCRProcessor, | |
| VisionEncoderDecoderModel, | |
| AutoTokenizer, | |
| AutoModelForCausalLM | |
| ) | |
| # ============================================================ | |
| # Device | |
| # ============================================================ | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # ============================================================ | |
| # Load Models (cached by HF Spaces) | |
| # ============================================================ | |
| ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") | |
| ocr_model = VisionEncoderDecoderModel.from_pretrained( | |
| "microsoft/trocr-base-printed" | |
| ).to(device) | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| "Qwen/Qwen2.5-1.5B-Instruct", | |
| trust_remote_code=True | |
| ) | |
| qwen_model = AutoModelForCausalLM.from_pretrained( | |
| "Qwen/Qwen2.5-1.5B-Instruct", | |
| device_map="auto", | |
| torch_dtype=torch.float16 if device == "cuda" else torch.float32, | |
| trust_remote_code=True | |
| ) | |
| # ============================================================ | |
| # Helpers | |
| # ============================================================ | |
| def is_scanned_pdf(reader): | |
| for page in reader.pages: | |
| if page.extract_text(): | |
| return False | |
| return True | |
| def extract_text_from_pdf(file): | |
| reader = PdfReader(file) | |
| scanned = is_scanned_pdf(reader) | |
| extracted_text = [] | |
| if not scanned: | |
| # Digital PDF | |
| for page in reader.pages: | |
| text = page.extract_text() | |
| if text: | |
| extracted_text.append(text) | |
| else: | |
| # OCR only embedded images (HF-safe) | |
| for page in reader.pages: | |
| if "/XObject" in page["/Resources"]: | |
| xobjects = page["/Resources"]["/XObject"].get_object() | |
| for obj in xobjects: | |
| xobj = xobjects[obj] | |
| if xobj["/Subtype"] == "/Image": | |
| image = Image.open(io.BytesIO(xobj.get_data())).convert("RGB") | |
| pixel_values = ocr_processor( | |
| images=image, | |
| return_tensors="pt" | |
| ).pixel_values.to(device) | |
| with torch.no_grad(): | |
| ids = ocr_model.generate(pixel_values) | |
| text = ocr_processor.batch_decode( | |
| ids, | |
| skip_special_tokens=True | |
| )[0] | |
| extracted_text.append(text) | |
| return "\n\n".join(extracted_text) | |
| def evaluate_text(text): | |
| prompt = f""" | |
| You are a strict academic evaluator. | |
| Evaluate the following document and assign marks out of 10. | |
| Criteria: | |
| - Clarity | |
| - Structure | |
| - Technical depth | |
| - Language quality | |
| - Completeness | |
| DOCUMENT: | |
| --------- | |
| {text[:6000]} | |
| --------- | |
| Respond strictly in this format: | |
| Score: X/10 | |
| Justification: | |
| Strengths: | |
| Weaknesses: | |
| """ | |
| inputs = tokenizer(prompt, return_tensors="pt").to(device) | |
| with torch.no_grad(): | |
| output = qwen_model.generate( | |
| **inputs, | |
| max_new_tokens=400, | |
| do_sample=False | |
| ) | |
| return tokenizer.decode(output[0], skip_special_tokens=True) | |
| # ============================================================ | |
| # Gradio Function | |
| # ============================================================ | |
| def process_pdf(pdf_file): | |
| extracted_text = extract_text_from_pdf(pdf_file) | |
| evaluation = evaluate_text(extracted_text) | |
| return extracted_text, evaluation | |
| # ============================================================ | |
| # Gradio UI | |
| # ============================================================ | |
| with gr.Blocks(title="PDF Evaluator (OCR + Qwen)") as demo: | |
| gr.Markdown(""" | |
| # 📄 PDF Evaluator | |
| Upload a PDF to: | |
| - Extract text (OCR if needed) | |
| - Evaluate content using Qwen | |
| - Get marks out of 10 | |
| """) | |
| pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) | |
| extract_btn = gr.Button("Extract & Evaluate") | |
| extracted_output = gr.Textbox( | |
| label="Extracted Text", | |
| lines=20 | |
| ) | |
| evaluation_output = gr.Textbox( | |
| label="Evaluation", | |
| lines=10 | |
| ) | |
| extract_btn.click( | |
| process_pdf, | |
| inputs=pdf_input, | |
| outputs=[extracted_output, evaluation_output] | |
| ) | |
| demo.launch() | |