Spaces:
Running
Running
| import gradio as gr | |
| from transformers import TrOCRProcessor, VisionEncoderDecoderModel | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| # Load TrOCR Model from Hugging Face | |
# The processor and the model are loaded from the same checkpoint, so the
# identifier is spelled once and shared by both calls.
_TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
| # Function to extract text from PDF | |
def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF at ``pdf_path`` is turned into page images with
    ``convert_from_path``. Each image is fed to the module-level TrOCR
    ``processor`` / ``model`` pair; if the decoded result is empty or only
    whitespace, the page is re-read with Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string containing the text of each page, joined by newlines.
    """
    page_texts = []
    for page in convert_from_path(pdf_path):
        # Primary pass: encode the page image, generate token ids, decode.
        encoded = processor(images=page, return_tensors="pt")
        token_ids = model.generate(encoded.pixel_values)
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        page_text = decoded[0]

        # Fallback to Tesseract when TrOCR produced nothing usable.
        if page_text.strip() == "":
            page_text = pytesseract.image_to_string(page)

        page_texts.append(page_text)

    return "\n".join(page_texts)
| # Gradio Interface | |
def ocr_pipeline(pdf_file):
    """Gradio callback: extract the text of an uploaded PDF.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. It may be
            an object exposing the on-disk path as ``.name``, a plain path
            string, or ``None`` when nothing was uploaded.

    Returns:
        The text produced by ``extract_text_from_pdf`` for the upload, or
        a short notice when no file was provided.
    """
    # A submission without a file arrives as None; reply with a readable
    # message instead of failing on ``None.name``.
    if pdf_file is None:
        return "No PDF file was uploaded."

    # Accept both file wrappers (path stored in ``.name``) and bare path
    # strings, which have no ``.name`` attribute of their own.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    return extract_text_from_pdf(pdf_path)
# Web UI definition: one PDF upload as input, the extracted text as output.
iface = gr.Interface(
    title="PDF Text Extraction using TrOCR",
    fn=ocr_pipeline,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
)

# Run the Gradio App
if __name__ == "__main__":
    # Launch only when executed as a script, not when imported as a module.
    iface.launch()