File size: 1,960 Bytes
47b61ab
d73dc88
5043be1
47b61ab
 
 
d73dc88
1560fe7
5043be1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47b61ab
 
5043be1
47b61ab
 
d73dc88
47b61ab
 
47e620b
47b61ab
5043be1
47b61ab
5043be1
47e620b
47b61ab
5043be1
 
47b61ab
 
d73dc88
47b61ab
 
 
 
 
d73dc88
 
 
47b61ab
 
d73dc88
 
 
 
 
47b61ab
 
d73dc88
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import io
import logging

import pytesseract
import streamlit as st
from docx import Document
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
from transformers import pipeline

# Set up OCR pipelines
@st.cache_resource
def _load_handwritten_ocr():
    """Build the handwriting image-to-text pipeline once and reuse it.

    Streamlit re-executes this script on every widget interaction; without
    caching, the model would be re-instantiated on each rerun.
    """
    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


handwritten_ocr = _load_handwritten_ocr()

def extract_text(image):
    """Extract text from an image using both OCR engines.

    The image is passed to the module-level ``handwritten_ocr`` pipeline and
    to Tesseract; the two results are stripped and joined with a newline,
    handwriting result first.

    Args:
        image: Image object accepted by both ``handwritten_ocr`` and
            ``pytesseract.image_to_string`` (the caller in this file passes
            an RGB PIL image).

    Returns:
        The combined text, or ``"No text detected."`` when both engines
        return nothing but whitespace.

    Raises:
        Whatever ``pytesseract.image_to_string`` raises; only the handwriting
        pass is treated as best-effort.
    """
    try:
        # Try Hugging Face model for handwriting
        handwritten_text = handwritten_ocr(image)[0]["generated_text"]
    except Exception:
        # Best-effort: keep going with Tesseract only, but record the failure
        # so a broken model is distinguishable from "no handwriting found".
        logging.getLogger(__name__).exception(
            "Handwriting OCR failed; continuing with printed-text OCR only"
        )
        handwritten_text = ""

    # Use Tesseract for printed text
    printed_text = pytesseract.image_to_string(image)

    # Combine both results, skipping whichever engine produced nothing so the
    # output never starts or ends with a stray newline.
    parts = [
        part
        for part in (handwritten_text.strip(), printed_text.strip())
        if part
    ]
    return "\n".join(parts) or "No text detected."

# PDF layout. The origin is the bottom-left corner of the page, in points.
_PDF_LEFT = 100
_PDF_TOP = 750
_PDF_RIGHT_MARGIN = 50
_PDF_BOTTOM = 50
_PDF_FONT_NAME = "Helvetica"
_PDF_FONT_SIZE = 12
_PDF_LINE_HEIGHT = 14
_DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)


def _build_docx(text):
    """Return an in-memory DOCX file containing ``text`` as one paragraph."""
    buffer = io.BytesIO()
    doc = Document()
    doc.add_paragraph(text)
    doc.save(buffer)
    buffer.seek(0)
    return buffer


def _build_pdf(text):
    """Return an in-memory PDF containing ``text``, wrapped and paginated.

    ``Canvas.drawString`` draws exactly one line and does not interpret
    newlines, so the text is split on line breaks, each line is wrapped to
    the printable width, and a new page is started when the cursor reaches
    the bottom margin.
    """
    buffer = io.BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=A4)
    page_width = A4[0]
    max_width = page_width - _PDF_LEFT - _PDF_RIGHT_MARGIN

    pdf.setFont(_PDF_FONT_NAME, _PDF_FONT_SIZE)
    y = _PDF_TOP
    for raw_line in text.splitlines():
        # An empty source line still occupies one row in the output.
        wrapped = simpleSplit(
            raw_line, _PDF_FONT_NAME, _PDF_FONT_SIZE, max_width
        ) or [""]
        for line in wrapped:
            if y < _PDF_BOTTOM:
                pdf.showPage()
                # Re-apply the font explicitly after starting a new page.
                pdf.setFont(_PDF_FONT_NAME, _PDF_FONT_SIZE)
                y = _PDF_TOP
            pdf.drawString(_PDF_LEFT, y, line)
            y -= _PDF_LINE_HEIGHT

    pdf.save()
    buffer.seek(0)
    return buffer


# Streamlit UI
st.title("📝 Handwritten & Printed Text Extractor")

# File uploader
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Open and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="📷 Uploaded Image", use_container_width=True)

    # Convert to RGB before running OCR
    image = image.convert("RGB")

    # Extract text
    extracted_text = extract_text(image)

    # Display extracted text
    st.subheader("📜 Extracted Text")
    st.write(extracted_text)

    # Build both export formats in memory
    docx_buffer = _build_docx(extracted_text)
    pdf_buffer = _build_pdf(extracted_text)

    # Download buttons
    st.download_button(
        "⬇️ Download as DOCX",
        data=docx_buffer,
        file_name="extracted_text.docx",
        mime=_DOCX_MIME,
    )
    st.download_button(
        "⬇️ Download as PDF",
        data=pdf_buffer,
        file_name="extracted_text.pdf",
        mime="application/pdf",
    )