engrrifatullah's picture
Update app.py
5043be1 verified
import io
import logging

import pytesseract
import streamlit as st
from docx import Document
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
from transformers import pipeline
# Set up OCR pipelines
@st.cache_resource
def _load_handwritten_ocr():
    """Build the TrOCR handwriting pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every user interaction; without the
    cache the model would be re-instantiated each time.
    """
    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


handwritten_ocr = _load_handwritten_ocr()
def extract_text(image):
    """Extract text from an image using both OCR engines.

    The image is passed to the ``handwritten_ocr`` pipeline and to Tesseract;
    the two results are returned together, handwriting result first,
    separated by a newline.

    Args:
        image: The image to read. The caller in this file passes an RGB
            PIL image.

    Returns:
        The combined text with surrounding whitespace removed, or
        ``"No text detected."`` when neither engine produced any text.

    Raises:
        Any exception raised by ``pytesseract.image_to_string``; only the
        handwriting pass is best-effort.
    """
    try:
        handwritten_text = handwritten_ocr(image)[0]["generated_text"]
    except Exception:
        # Best-effort: a failing handwriting model must not block the
        # Tesseract pass, but the failure is logged instead of vanishing.
        logging.getLogger(__name__).warning(
            "Handwriting OCR failed; continuing with Tesseract only",
            exc_info=True,
        )
        handwritten_text = ""

    printed_text = pytesseract.image_to_string(image)

    # Drop empty results before joining so no stray newline is produced.
    parts = [part.strip() for part in (handwritten_text, printed_text)]
    combined = "\n".join(part for part in parts if part)
    return combined or "No text detected."
# Streamlit UI
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _build_docx(text):
    """Return an in-memory DOCX file holding *text* as a single paragraph."""
    doc = Document()
    doc.add_paragraph(text)
    buffer = io.BytesIO()
    doc.save(buffer)
    buffer.seek(0)  # rewind so the download starts at the first byte
    return buffer


def _build_pdf(text):
    """Return an in-memory PDF of *text*, wrapped to the page and paginated.

    ``Canvas.drawString`` draws exactly one line and neither wraps nor honours
    newlines, so the text is split into lines that fit the page width and a
    new page is started whenever the current one is full.
    """
    # Font is set explicitly so the width measurements match what is drawn.
    font_name, font_size = "Helvetica", 12
    # Points. ``left`` and ``top`` keep the position the text started at before.
    left, top, bottom = 100, 750, 72
    leading = font_size * 1.2
    max_width = A4[0] - 2 * left

    buffer = io.BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=A4)
    pdf.setFont(font_name, font_size)

    y = top
    for paragraph in text.splitlines():
        # simpleSplit returns [] for a blank line; keep it as vertical space.
        wrapped = simpleSplit(paragraph, font_name, font_size, max_width) or [""]
        for line in wrapped:
            if y < bottom:
                pdf.showPage()
                # showPage resets the graphics state, including the font.
                pdf.setFont(font_name, font_size)
                y = top
            pdf.drawString(left, y, line)
            y -= leading

    pdf.save()
    buffer.seek(0)  # rewind so the download starts at the first byte
    return buffer


st.title("📝 Handwritten & Printed Text Extractor")

# File uploader
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Open and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="📷 Uploaded Image", use_container_width=True)

    # Convert to RGB before handing the image to the OCR engines
    image = image.convert("RGB")

    # Extract text
    extracted_text = extract_text(image)

    # Display extracted text. st.text shows it verbatim; st.write would
    # render it as Markdown, collapsing newlines and formatting stray
    # '#', '*' or '_' characters produced by OCR.
    st.subheader("📜 Extracted Text")
    st.text(extracted_text)

    # Build both export formats in memory
    docx_buffer = _build_docx(extracted_text)
    pdf_buffer = _build_pdf(extracted_text)

    # Download buttons
    st.download_button(
        "⬇️ Download as DOCX",
        data=docx_buffer,
        file_name="extracted_text.docx",
        mime=DOCX_MIME,
    )
    st.download_button(
        "⬇️ Download as PDF",
        data=pdf_buffer,
        file_name="extracted_text.pdf",
        mime="application/pdf",
    )