import gradio as gr
import torch
import pypdfium2
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Load model and processor.
# The Qwen2VL* classes require a Qwen2-VL checkpoint; replace with your preferred Qwen2-VL model if needed.
model_name = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
).to(device)
model.eval()

# Convert PDF to a list of PIL images (one per page)
def pdf_to_images(pdf_path):
    pdf = pypdfium2.PdfDocument(pdf_path)
    return [page.render().to_pil() for page in pdf]

# Generate text from each page image using the vision-language model
def process_pdf(pdf_path):
    images = pdf_to_images(pdf_path)
    results = []
    for image in images:
        # Qwen2-VL expects a chat-style prompt containing an image placeholder
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Transcribe all text on this page."},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=256)
        # Decode only the newly generated tokens, not the prompt
        generated_ids = output_ids[:, inputs["input_ids"].shape[1]:]
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        results.append(text.strip())
    return "\n\n".join(results)

# Gradio UI
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs="text",
    title="olmOCR PDF Processor",
)

if __name__ == "__main__":
    demo.launch()