Spaces:

engrrifatullah
/

Image_text_extractor

Running

App Files Files Community

Image_text_extractor / app.py

engrrifatullah

Update app.py

5043be1 verified over 1 year ago

raw

history blame contribute delete

1.96 kB

	import io
	import logging

	import pytesseract
	import streamlit as st
	from docx import Document
	from PIL import Image
	from reportlab.lib.pagesizes import A4
	from reportlab.lib.utils import simpleSplit
	from reportlab.pdfgen import canvas
	from transformers import pipeline

	# Set up OCR pipelines
	@st.cache_resource
	def _load_handwritten_ocr():
	    """Build the TrOCR handwriting pipeline once and reuse it across reruns.

	    Streamlit re-executes this script from top to bottom on every user
	    interaction; caching the pipeline as a resource avoids re-creating the
	    model each time.
	    """
	    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


	# Module-level handle used by extract_text(); same object type as before.
	handwritten_ocr = _load_handwritten_ocr()

	def extract_text(image):
	    """Extract text from an image using both the handwriting model and Tesseract.

	    The image is first passed to the ``handwritten_ocr`` pipeline; any failure
	    there is logged and treated as "no handwritten text" so that Tesseract
	    still gets a chance to run. The two results are then joined with a
	    newline, handwritten text first.

	    Args:
	        image: The image to read. The caller in this file passes a PIL image
	            converted to RGB.

	    Returns:
	        The combined, whitespace-trimmed text, or ``"No text detected."`` when
	        neither engine produced any text.

	    Raises:
	        Whatever ``pytesseract.image_to_string`` raises; only the handwriting
	        step is best-effort.
	    """
	    try:
	        # Try Hugging Face model for handwriting
	        handwritten_text = handwritten_ocr(image)[0]['generated_text']
	    except Exception:
	        # Deliberate best-effort: keep going with Tesseract only, but leave a
	        # trace in the logs instead of silently discarding the failure.
	        logging.getLogger(__name__).warning(
	            "Handwriting OCR failed; continuing with Tesseract only.",
	            exc_info=True,
	        )
	        handwritten_text = ""

	    # Use Tesseract for printed text
	    printed_text = pytesseract.image_to_string(image)

	    # Combine both results, skipping whichever side came back empty
	    parts = [handwritten_text.strip(), printed_text.strip()]
	    return "\n".join(part for part in parts if part) or "No text detected."

	# Streamlit UI
	#
	# Flow: upload an image -> show it -> run OCR -> show the text -> offer the
	# text as DOCX and PDF downloads.

	# PDF layout, in points. The text origin (100, 750) is the position the text
	# has always been drawn at; the remaining values control wrapping and paging.
	PDF_FONT_NAME = "Helvetica"
	PDF_FONT_SIZE = 12
	PDF_LEFT_X = 100
	PDF_TOP_Y = 750
	PDF_BOTTOM_Y = 50
	PDF_RIGHT_MARGIN = 50
	PDF_LINE_HEIGHT = 14
	DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


	def _build_docx(text: str) -> io.BytesIO:
	    """Return an in-memory DOCX containing ``text``, rewound to the start."""
	    doc = Document()
	    doc.add_paragraph(text)
	    buffer = io.BytesIO()
	    doc.save(buffer)
	    buffer.seek(0)
	    return buffer


	def _build_pdf(text: str) -> io.BytesIO:
	    """Return an in-memory PDF containing ``text``, rewound to the start.

	    ``Canvas.drawString`` draws a single line and does not interpret line
	    breaks, so the text is split on newlines, wrapped to the usable page
	    width, and drawn one line at a time, starting a new page when the
	    current one is full.
	    """
	    page_width, _ = A4
	    max_width = page_width - PDF_LEFT_X - PDF_RIGHT_MARGIN

	    buffer = io.BytesIO()
	    pdf = canvas.Canvas(buffer, pagesize=A4)
	    pdf.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)

	    y = PDF_TOP_Y
	    for raw_line in text.splitlines():
	        # simpleSplit yields nothing for an empty line; keep it as a blank row.
	        wrapped = simpleSplit(raw_line, PDF_FONT_NAME, PDF_FONT_SIZE, max_width) or [""]
	        for line in wrapped:
	            if y < PDF_BOTTOM_Y:
	                pdf.showPage()
	                # Graphics state is reset on a new page, so set the font again.
	                pdf.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)
	                y = PDF_TOP_Y
	            pdf.drawString(PDF_LEFT_X, y, line)
	            y -= PDF_LINE_HEIGHT

	    pdf.save()
	    buffer.seek(0)
	    return buffer


	st.title("📝 Handwritten & Printed Text Extractor")

	# File uploader
	uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "jpeg", "png"])

	if uploaded_file is not None:
	    # Open and display the uploaded image
	    image = Image.open(uploaded_file)
	    st.image(image, caption="📷 Uploaded Image", use_container_width=True)

	    # Convert to RGB
	    image = image.convert("RGB")

	    # Extract text
	    extracted_text = extract_text(image)

	    # Display extracted text
	    st.subheader("📜 Extracted Text")
	    st.write(extracted_text)

	    # Build both export formats in memory
	    docx_buffer = _build_docx(extracted_text)
	    pdf_buffer = _build_pdf(extracted_text)

	    # Download buttons
	    st.download_button(
	        "⬇️ Download as DOCX",
	        data=docx_buffer,
	        file_name="extracted_text.docx",
	        mime=DOCX_MIME,
	    )
	    st.download_button(
	        "⬇️ Download as PDF",
	        data=pdf_buffer,
	        file_name="extracted_text.pdf",
	        mime="application/pdf",
	    )