Spaces:

engrrifatullah
/

Image_text_extractor

Sleeping

App Files Community

engrrifatullah committed on Feb 10, 2025

Commit

d73dc88

verified ·

1 Parent(s): 348d51a

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -15

app.py CHANGED Viewed

@@ -1,25 +1,23 @@
 import streamlit as st
-import pdfkit
 from PIL import Image
 from docx import Document
 from transformers import pipeline
-# Configure pdfkit with wkhtmltopdf (Ensure wkhtmltopdf is installed)
-config = pdfkit.configuration(wkhtmltopdf='/usr/bin/wkhtmltopdf')  # Adjust path if necessary
 # NOTE: this pane shows the pre-commit version of app.py; lines marked '-' are
 # the ones the commit removes (diff convention — confirm against the repo).
 # Set up OCR pipeline from Hugging Face
 ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
 # Streamlit UI
-st.title("Handwritten Text Extractor")
 # File uploader
 # Only jpg / jpeg / png uploads are accepted; everything below runs only once
 # a file has been provided.
-uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
 if uploaded_file is not None:
     # Open and display the uploaded image
     image = Image.open(uploaded_file)
-    st.image(image, caption="Uploaded Image", use_column_width=True)
     # Convert the image to RGB format
     image = image.convert("RGB")
@@ -28,20 +26,23 @@ if uploaded_file is not None:
     # Only the first result's 'generated_text' field is used as the OCR output.
     extracted_text = ocr_pipeline(image)[0]['generated_text']
     # Display extracted text
-    st.subheader("Extracted Text")
     st.write(extracted_text)
     # Save as DOCX
     doc = Document()
     doc.add_paragraph(extracted_text)
     # The DOCX is written to a fixed, relative filename on disk.
-    docx_filename = "extracted_text.docx"
-    doc.save(docx_filename)
     # Save as PDF
     # The PDF is produced by pdfkit from the raw text, using the wkhtmltopdf
     # configuration created at the top of the script.
-    pdf_filename = "extracted_text.pdf"
-    pdfkit.from_string(extracted_text, pdf_filename, configuration=config)
     # Download buttons
     # NOTE(review): the file objects opened below are handed straight to
     # download_button and are never explicitly closed.
-    st.download_button("Download as DOCX", data=open(docx_filename, "rb"), file_name=docx_filename)
-    st.download_button("Download as PDF", data=open(pdf_filename, "rb"), file_name=pdf_filename)

 import streamlit as st
+import io
 from PIL import Image
 from docx import Document
 from transformers import pipeline
+from reportlab.pdfgen import canvas
 # Set up OCR pipeline from Hugging Face
 # The pipeline is built at module top level, so this call runs every time the
 # script itself is executed.
 # NOTE(review): Streamlit presumably re-runs the script on each interaction,
 # which would reload the model each time — consider st.cache_resource; confirm.
 ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
 # Streamlit UI
+st.title("📝 Handwritten Text Extractor")
 # File uploader
 # Only jpg / jpeg / png uploads are accepted; everything below runs only once
 # a file has been provided.
+uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "jpeg", "png"])
 if uploaded_file is not None:
     # Open and display the uploaded image
     image = Image.open(uploaded_file)
     # NOTE(review): use_column_width may be deprecated in newer Streamlit
     # releases in favour of use_container_width — confirm for the pinned version.
+    st.image(image, caption="📷 Uploaded Image", use_column_width=True)
     # Convert the image to RGB format
     image = image.convert("RGB")
     # NOTE: the diff view elides the new-file lines between the two hunks here.
     # Only the first result's 'generated_text' field is used as the OCR output.
     extracted_text = ocr_pipeline(image)[0]['generated_text']
     # Display extracted text
+    st.subheader("📜 Extracted Text")
     st.write(extracted_text)
     # Save as DOCX
     doc = Document()
     doc.add_paragraph(extracted_text)
     # The document is saved into an in-memory buffer instead of a file on disk;
     # the buffer is rewound so the download starts from the first byte.
+    docx_buffer = io.BytesIO()
+    doc.save(docx_buffer)
+    docx_buffer.seek(0)
     # Save as PDF
     # The PDF is drawn with a reportlab canvas that writes into an in-memory buffer.
     # NOTE(review): a single drawString call places the whole text on one line
     # starting at (100, 750); long or multi-line OCR output will presumably run
     # off the page — confirm, and consider wrapping the text across lines.
+    pdf_buffer = io.BytesIO()
+    pdf = canvas.Canvas(pdf_buffer)
+    pdf.drawString(100, 750, extracted_text)
+    pdf.save()
+    pdf_buffer.seek(0)
     # Download buttons
     # NOTE(review): only the PDF button passes an explicit mime type; the DOCX
     # button passes none.
+    st.download_button("⬇️ Download as DOCX", data=docx_buffer, file_name="extracted_text.docx")
+    st.download_button("⬇️ Download as PDF", data=pdf_buffer, file_name="extracted_text.pdf", mime="application/pdf")