"""Streamlit app that extracts handwritten and printed text from an image.

The uploaded image is run through a TrOCR handwriting model and through
Tesseract; the combined text is shown on the page and offered for download
as DOCX and PDF.
"""

import io
import logging

import pytesseract
import streamlit as st
from docx import Document
from PIL import Image
from reportlab.pdfgen import canvas
from transformers import pipeline

logger = logging.getLogger(__name__)

# PDF layout, in points. The text origin (100, 750) matches the position the
# script has always used for the first line of text.
PDF_PAGE_SIZE = (595.27, 841.89)  # A4
PDF_FONT_NAME = "Helvetica"
PDF_FONT_SIZE = 12
PDF_LEFT_X = 100
PDF_TOP_Y = 750
PDF_RIGHT_MARGIN = 50
PDF_BOTTOM_Y = 50
PDF_LEADING = 14

DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)


@st.cache_resource
def _load_handwritten_ocr():
    """Load the TrOCR handwriting pipeline once per server process.

    Streamlit re-executes the whole script on every interaction (including
    clicks on the download buttons), so an uncached ``pipeline(...)`` call
    would re-instantiate the model on each rerun.
    """
    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


# Set up OCR pipelines
handwritten_ocr = _load_handwritten_ocr()


def extract_text(image):
    """Extract text from both handwritten and printed documents.

    Args:
        image: The image to read, passed unchanged to the handwriting
            pipeline and to ``pytesseract.image_to_string``.

    Returns:
        The handwriting result and the Tesseract result, each stripped and
        joined by a newline (empty results are omitted), or
        ``"No text detected."`` when both are empty.
    """
    try:
        # Try Hugging Face model for handwriting
        handwritten_text = handwritten_ocr(image)[0]["generated_text"]
    except Exception:
        # Best effort: a failure of the handwriting model must not prevent
        # the Tesseract pass, but it should not vanish silently either.
        logger.warning("Handwritten OCR failed; using Tesseract only.",
                       exc_info=True)
        handwritten_text = ""

    # Use Tesseract for printed text
    printed_text = pytesseract.image_to_string(image)

    # Combine both results, skipping whichever engine returned nothing
    parts = (handwritten_text.strip(), printed_text.strip())
    combined = "\n".join(part for part in parts if part)
    return combined or "No text detected."


def _wrap_for_pdf(pdf, line, max_width):
    """Split one logical line into pieces no wider than *max_width* points.

    Breaks happen between whitespace-separated words only, so a single word
    wider than *max_width* is kept whole. A blank line yields one empty piece
    so that vertical spacing is preserved.
    """
    words = line.split()
    if not words:
        return [""]

    pieces = []
    current = words[0]
    for word in words[1:]:
        candidate = f"{current} {word}"
        if pdf.stringWidth(candidate, PDF_FONT_NAME, PDF_FONT_SIZE) <= max_width:
            current = candidate
        else:
            pieces.append(current)
            current = word
    pieces.append(current)
    return pieces


def _build_docx(text):
    """Return a rewound in-memory DOCX file containing *text*."""
    buffer = io.BytesIO()
    doc = Document()
    doc.add_paragraph(text)
    doc.save(buffer)
    buffer.seek(0)
    return buffer


def _build_pdf(text):
    """Return a rewound in-memory PDF containing *text*.

    ``Canvas.drawString`` draws exactly one line, so the text is split on
    newlines, wrapped to the printable width, and continued on a new page
    whenever the bottom margin is reached.
    """
    buffer = io.BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=PDF_PAGE_SIZE)
    pdf.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)

    max_width = PDF_PAGE_SIZE[0] - PDF_LEFT_X - PDF_RIGHT_MARGIN
    y = PDF_TOP_Y
    for logical_line in text.splitlines():
        for piece in _wrap_for_pdf(pdf, logical_line, max_width):
            if y < PDF_BOTTOM_Y:
                pdf.showPage()
                # Set the font again explicitly at the start of each page.
                pdf.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)
                y = PDF_TOP_Y
            pdf.drawString(PDF_LEFT_X, y, piece)
            y -= PDF_LEADING

    pdf.save()
    buffer.seek(0)
    return buffer


# Streamlit UI
st.title("📝 Handwritten & Printed Text Extractor")

# File uploader
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Open and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="📷 Uploaded Image", use_container_width=True)

    # Convert to RGB
    image = image.convert("RGB")

    # Extract text
    extracted_text = extract_text(image)

    # Display extracted text
    st.subheader("📜 Extracted Text")
    st.write(extracted_text)

    # Save as DOCX
    docx_buffer = _build_docx(extracted_text)

    # Save as PDF
    pdf_buffer = _build_pdf(extracted_text)

    # Download buttons
    st.download_button(
        "⬇️ Download as DOCX",
        data=docx_buffer,
        file_name="extracted_text.docx",
        mime=DOCX_MIME,
    )
    st.download_button(
        "⬇️ Download as PDF",
        data=pdf_buffer,
        file_name="extracted_text.pdf",
        mime="application/pdf",
    )