Spaces:

engrrifatullah
/

Image_text_extractor

Running

engrrifatullah committed on Feb 10, 2025

Commit

47b61ab

verified ·

1 Parent(s): f395fbd

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import html
+import io
+
+import cv2
+import numpy as np
+import pdfkit
+import pytesseract
+import streamlit as st
+from docx import Document
+from PIL import Image
+from transformers import pipeline
+# Set up OCR pipeline (you can replace with Hugging Face model)
+ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
+# Streamlit UI
+st.title("Handwritten Text Extractor")
+# File uploader
+uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
+if uploaded_file is not None:
+    image = Image.open(uploaded_file)
+    st.image(image, caption="Uploaded Image", use_column_width=True)
+    # Convert image to numpy array
+    image_np = np.array(image)
+    # Extract text using OCR model
+    extracted_text = ocr_pipeline(image_np)[0]['generated_text']
+    # Display extracted text
+    st.subheader("Extracted Text")
+    st.write(extracted_text)
+    # Save as DOCX
+    doc = Document()
+    doc.add_paragraph(extracted_text)
+    docx_filename = "extracted_text.docx"
+    doc.save(docx_filename)
+    # Save as PDF
+    pdf_filename = "extracted_text.pdf"
+    pdfkit.from_string(extracted_text, pdf_filename)
+    # Download buttons
+    st.download_button("Download as DOCX", data=open(docx_filename, "rb"), file_name=docx_filename)
+    st.download_button("Download as PDF", data=open(pdf_filename, "rb"), file_name=pdf_filename)