"""Gradio demo: run OCR on an uploaded image, then answer a question about the text."""

import logging

import gradio as gr
from transformers import pipeline

logger = logging.getLogger(__name__)

# Step 1: OCR model (extracts text from the image)
# NOTE(review): TrOCR checkpoints are presumably tuned for single text-line
# crops; confirm quality on full multi-line document pages.
ocr = pipeline("image-to-text", model="microsoft/trocr-base-printed")

# Step 2: QA model (answers questions about extracted text)
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")


def process(image, question):
    """Extract text from ``image`` and answer ``question`` about that text.

    Args:
        image: The uploaded image (the interface below delivers a PIL image).
            ``None`` is tolerated, e.g. when nothing was uploaded.
        question: Free-text question about the document. ``None``, empty, or
            whitespace-only values mean "no question asked".

    Returns:
        A ``(extracted_text, answer)`` tuple of strings. The second element is
        a hint or error message when no answer could be produced. On an
        unexpected failure the tuple is
        ``("Error during processing.", <exception message>)``.
    """
    # Guard clause: without an image there is nothing to OCR, so say so
    # instead of surfacing a pipeline exception to the user.
    if image is None:
        return "", "Please upload an image."

    # Treat whitespace-only input the same as an empty question.
    question = (question or "").strip()

    try:
        # Extract text from the image; an empty result list means no text.
        ocr_results = ocr(image)
        extracted_text = ocr_results[0]["generated_text"] if ocr_results else ""

        # If no question is asked, just return the extracted text
        if not question:
            return extracted_text, "Please enter a question."

        # The QA pipeline is expected to reject an empty context, so report
        # the real cause rather than a generic processing error.
        if not extracted_text.strip():
            return extracted_text, "No text could be extracted from the image."

        # Run QA on the extracted text
        answer = qa(question=question, context=extracted_text)
        # `or` also covers an empty-string answer, not only a missing key.
        return extracted_text, answer.get("answer") or "No answer found."
    except Exception as e:
        # Top-level UI boundary: keep the traceback in the logs and show the
        # user a readable message instead of crashing the request.
        logger.exception("OCR/QA processing failed")
        return "Error during processing.", str(e)


# Gradio Interface
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        gr.Textbox(label="Ask a question about the document"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Answer"),
    ],
    title="OCR + Question Answering",
    description="Upload a document image, extract text, and ask questions about it.",
)

if __name__ == "__main__":
    demo.launch()