import io
import logging
import textwrap

import pytesseract
import streamlit as st
from docx import Document
from PIL import Image
from reportlab.pdfgen import canvas
from transformers import pipeline
|
|
| |
@st.cache_resource
def _load_handwritten_ocr():
    """Build the TrOCR handwriting pipeline once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    constructing the pipeline at module top level would instantiate the model
    again on each rerun. ``st.cache_resource`` keeps a single shared instance.

    Returns:
        The ``image-to-text`` pipeline for ``microsoft/trocr-base-handwritten``.
    """
    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


# Kept under its original module-level name so existing callers are unaffected.
handwritten_ocr = _load_handwritten_ocr()
|
|
def extract_text(image):
    """Extract text from an image with both OCR engines.

    Runs the handwriting pipeline (``handwritten_ocr``) and Tesseract
    (``pytesseract.image_to_string``) on the same image and returns both
    results joined by a newline, handwriting output first.

    Args:
        image: The image to read; passed unchanged to both engines.

    Returns:
        The combined, whitespace-trimmed text, or ``"No text detected."`` when
        neither engine produced any non-whitespace output.

    Raises:
        Any exception raised by ``pytesseract.image_to_string``. Only failures
        of the handwriting pipeline are tolerated.
    """
    try:
        handwritten_text = handwritten_ocr(image)[0]["generated_text"]
    except Exception:
        # Deliberate best effort: a handwriting-model failure must not prevent
        # printed-text OCR, but record the traceback so it can be diagnosed
        # instead of vanishing silently.
        logging.getLogger(__name__).warning(
            "Handwritten OCR failed; continuing with printed-text OCR only.",
            exc_info=True,
        )
        handwritten_text = ""

    printed_text = pytesseract.image_to_string(image)

    # Join only the non-empty results so a missing half leaves no stray newline.
    parts = (handwritten_text.strip(), printed_text.strip())
    extracted_text = "\n".join(part for part in parts if part)
    return extracted_text or "No text detected."
|
|
| |
# PDF layout, in points; ReportLab measures coordinates from the bottom-left
# corner of the page.
PDF_LEFT_X = 100
PDF_TOP_Y = 750
PDF_BOTTOM_Y = 60
PDF_LINE_HEIGHT = 14
# Character-count wrap limit. This is an approximation for the canvas default
# font; unusually wide glyph runs may still extend past the right edge.
PDF_WRAP_COLUMNS = 75

DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)


def _build_docx(text):
    """Return a rewound in-memory DOCX file holding ``text`` as one paragraph."""
    buffer = io.BytesIO()
    document = Document()
    document.add_paragraph(text)
    document.save(buffer)
    buffer.seek(0)
    return buffer


def _build_pdf(text):
    """Return a rewound in-memory PDF containing ``text``.

    ``Canvas.drawString`` draws exactly one line and ignores newlines, so the
    text is split into lines, wrapped, and drawn one line at a time, starting
    a new page whenever the bottom margin is reached.
    """
    buffer = io.BytesIO()
    pdf = canvas.Canvas(buffer)
    y = PDF_TOP_Y
    for raw_line in text.splitlines():
        # textwrap.wrap() returns [] for a blank line; keep it as vertical space.
        for line in textwrap.wrap(raw_line, width=PDF_WRAP_COLUMNS) or [""]:
            if y < PDF_BOTTOM_Y:
                pdf.showPage()
                y = PDF_TOP_Y
            pdf.drawString(PDF_LEFT_X, y, line)
            y -= PDF_LINE_HEIGHT
    pdf.save()
    buffer.seek(0)
    return buffer


# NOTE(review): the leading symbols in the UI strings below look like
# mis-encoded emoji; they are kept verbatim here and should be restored from
# the original source once the intended glyphs are confirmed.
st.title("π Handwritten & Printed Text Extractor")

uploaded_file = st.file_uploader("π€ Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    try:
        image = Image.open(uploaded_file)
        # Image.open is lazy; force decoding so a corrupt file fails here.
        image.load()
    except OSError:
        # Pillow's UnidentifiedImageError is an OSError subclass.
        st.error("The uploaded file could not be read as an image.")
        st.stop()

    st.image(image, caption="π· Uploaded Image", use_container_width=True)

    # Normalise the colour mode before handing the image to the OCR engines.
    image = image.convert("RGB")

    extracted_text = extract_text(image)

    st.subheader("π Extracted Text")
    st.write(extracted_text)

    docx_buffer = _build_docx(extracted_text)
    pdf_buffer = _build_pdf(extracted_text)

    st.download_button(
        "β¬οΈ Download as DOCX",
        data=docx_buffer,
        file_name="extracted_text.docx",
        mime=DOCX_MIME,
    )
    st.download_button(
        "β¬οΈ Download as PDF",
        data=pdf_buffer,
        file_name="extracted_text.pdf",
        mime="application/pdf",
    )
|
|