import io
import logging

import pytesseract
import streamlit as st
from docx import Document
from PIL import Image
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
from transformers import pipeline

# Set up OCR pipelines
@st.cache_resource
def _load_handwritten_ocr():
    """Build the TrOCR handwriting pipeline once per server process.

    Streamlit re-executes this script on every user interaction; without
    ``st.cache_resource`` the model would be re-instantiated on each rerun.
    """
    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


# Module-level handle used by extract_text(); same name and call signature as before.
handwritten_ocr = _load_handwritten_ocr()

def extract_text(image):
    """Extract text from an image with both a handwriting and a printed-text OCR engine.

    The image is passed to the module-level ``handwritten_ocr`` pipeline and
    to Tesseract (``pytesseract.image_to_string``); the two results are
    combined.

    Args:
        image: The image to read. It is handed unchanged to both OCR engines
            (the caller in this file passes an RGB ``PIL.Image``).

    Returns:
        The stripped handwriting result followed by the stripped Tesseract
        result, separated by a newline (an empty result is omitted), or the
        string ``"No text detected."`` when both results are empty.
    """
    try:
        results = handwritten_ocr(image)
        handwritten_text = results[0]["generated_text"] if results else ""
    except Exception:
        # Handwriting recognition is best-effort: fall back to Tesseract alone,
        # but record the failure instead of discarding it silently.
        logging.getLogger(__name__).warning(
            "Handwriting OCR failed; continuing with Tesseract only.",
            exc_info=True,
        )
        handwritten_text = ""

    # Use Tesseract for printed text
    printed_text = pytesseract.image_to_string(image)

    # Combine both results, skipping whichever engine produced nothing
    parts = [part for part in (handwritten_text.strip(), printed_text.strip()) if part]
    return "\n".join(parts) or "No text detected."

# Streamlit UI

# Explicit MIME type so the DOCX download is not served as a generic binary.
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# PDF layout, in points. The text column (left margin + max width = 500pt)
# fits inside both A4 (595pt) and US Letter (612pt) page widths.
PDF_LEFT_MARGIN = 100
PDF_TOP_Y = 750
PDF_BOTTOM_Y = 50
PDF_MAX_LINE_WIDTH = 400
PDF_FONT_NAME = "Helvetica"
PDF_FONT_SIZE = 12
PDF_LINE_HEIGHT = 14


def _build_docx(text):
    """Return the bytes of a DOCX document containing *text* as one paragraph."""
    doc = Document()
    doc.add_paragraph(text)
    buffer = io.BytesIO()
    doc.save(buffer)
    return buffer.getvalue()


def _build_pdf(text):
    """Return the bytes of a PDF containing *text*, wrapped and paginated.

    ``Canvas.drawString`` draws exactly one line, so the text is split on its
    own line breaks, each line is wrapped to the column width, and a new page
    is started whenever the bottom margin is reached.
    """
    buffer = io.BytesIO()
    pdf = canvas.Canvas(buffer)
    pdf.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)
    y = PDF_TOP_Y

    for source_line in text.splitlines():
        # simpleSplit returns [] for a blank line; keep it as an empty row so
        # paragraph spacing from the OCR output survives.
        wrapped = simpleSplit(
            source_line, PDF_FONT_NAME, PDF_FONT_SIZE, PDF_MAX_LINE_WIDTH
        ) or [""]
        for line in wrapped:
            if y < PDF_BOTTOM_Y:
                pdf.showPage()
                # Re-select the font explicitly so every page is drawn with
                # the same font the wrapping widths were measured with.
                pdf.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)
                y = PDF_TOP_Y
            pdf.drawString(PDF_LEFT_MARGIN, y, line)
            y -= PDF_LINE_HEIGHT

    pdf.save()
    return buffer.getvalue()


st.title("📝 Handwritten & Printed Text Extractor")

# File uploader
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Open the uploaded image; decode it immediately so a corrupt or
    # mislabeled file is reported to the user instead of raising later.
    try:
        image = Image.open(uploaded_file)
        image.load()
    except OSError:
        st.error("Could not read the uploaded file as an image.")
        st.stop()

    st.image(image, caption="📷 Uploaded Image", use_container_width=True)

    # Convert to RGB
    image = image.convert("RGB")

    # Extract text
    extracted_text = extract_text(image)

    # Display extracted text verbatim: st.text does no Markdown parsing, so
    # line breaks and characters such as '#' or '*' are shown as recognized.
    st.subheader("📜 Extracted Text")
    st.text(extracted_text)

    # Build both export formats in memory
    docx_bytes = _build_docx(extracted_text)
    pdf_bytes = _build_pdf(extracted_text)

    # Download buttons
    st.download_button(
        "⬇️ Download as DOCX",
        data=docx_bytes,
        file_name="extracted_text.docx",
        mime=DOCX_MIME,
    )
    st.download_button(
        "⬇️ Download as PDF",
        data=pdf_bytes,
        file_name="extracted_text.pdf",
        mime="application/pdf",
    )