Spaces:

Eli-Iustus
/

Vision

Sleeping

File size: 11,790 Bytes

import streamlit as st
import torch
import torchvision.transforms as transforms
from PIL import Image
from pillow_heif import register_heif_opener
import numpy as np
import os
from io import BytesIO
from googletrans import Translator, LANGUAGES
from gtts import gTTS

# Register the HEIF/HEIC opener with PIL so that Image.open() can decode the
# "heic" files accepted by the file uploader further down.
register_heif_opener()
from streamlit_cropper import st_cropper
import easyocr
# Set the browser-tab title/icon and switch the page to the wide layout.
# NOTE(review): Streamlit has historically required set_page_config() to run
# before any other st.* command on the page — keep it ahead of the UI code.
st.set_page_config(page_title="INK VISION", page_icon="✨", layout="wide")

# Inject the page-wide stylesheet and the title banner as raw HTML.
# The CSS gives .stApp an animated gradient background, forces white text on
# headings/paragraphs/labels, and restyles buttons, the file uploader and
# text areas; the trailing <div class="glass-container"> renders the header.
# unsafe_allow_html=True is needed so Streamlit emits the <style>/<div>
# markup instead of escaping it.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@500;700&family=Poppins:wght@300;400;600&display=swap');

/* Animated Gradient Background */
.stApp {
    background: linear-gradient(-45deg, #ee7752, #e73c7e, #23a6d5, #23d5ab);
    background-size: 400% 400%;
    animation: gradientBG 15s ease infinite;
    font-family: 'Poppins', sans-serif;
}

@keyframes gradientBG {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Base text color to white for contrast against dark/bright backgrounds */
h1, h2, h3, p, label {
    color: #ffffff !important;
    text-shadow: 1px 1px 4px rgba(0,0,0,0.4);
}

/* Glassmorphism wrapper for header */
.glass-container {
    background: rgba(255, 255, 255, 0.1);
    border-radius: 16px;
    box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
    backdrop-filter: blur(8.5px);
    -webkit-backdrop-filter: blur(8.5px);
    border: 1px solid rgba(255, 255, 255, 0.18);
    padding: 2rem;
    margin-top: 1rem;
    margin-bottom: 2rem;
}

/* Fancy Header Font */
h1 {
    font-family: 'Orbitron', sans-serif !important;
    font-size: 3rem !important;
    text-align: center;
    background: -webkit-linear-gradient(#fff, #f0f0f0);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 0.5rem;
}

/* Stylish buttons */
div.stButton > button:first-child {
    background: linear-gradient(90deg, #ff007f 0%, #7928ca 100%);
    color: white;
    border: none;
    border-radius: 50px;
    padding: 10px 24px;
    font-weight: 600;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px rgba(0,0,0,0.2);
}

div.stButton > button:first-child:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px rgba(0,0,0,0.3);
    background: linear-gradient(90deg, #7928ca 0%, #ff007f 100%);
    color: #ffffff !important;
}

/* File Uploader styling */
.stFileUploader > div > div {
    background: rgba(255, 255, 255, 0.05);
    border: 2px dashed rgba(255, 255, 255, 0.5);
    border-radius: 10px;
}

/* Text area styling */
.stTextArea textarea {
    background-color: rgba(255, 255, 255, 0.9) !important;
    color: #333333 !important;
    font-size: 1.5rem !important;
    font-weight: 600 !important;
    font-family: 'Poppins', sans-serif !important;
    border-radius: 10px !important;
    border: 2px solid transparent !important;
}
.stTextArea textarea:focus {
    border-color: #ff007f !important;
    box-shadow: 0 0 10px rgba(255,0,127,0.5) !important;
}

</style>

<div class="glass-container">
    <h1>✨ Handwritten Text Recognition System for Document Digitalization ✨</h1>
    <p style="text-align: center; font-size: 1.2rem;">..H..T..R..</p>
</div>
""", unsafe_allow_html=True)

from pipeline.preprocessor import DocumentPreprocessor
from pipeline.ocr_engine import HTREngine
from pipeline.postprocessor import NLPCorrector

# Translation client used by the "Translate" button in the result column.
translator = Translator()

# Seed the session-state keys the UI reads; values that already exist (from an
# earlier interaction in the same session) are left untouched.
for _state_key, _initial_value in (
    ("extracted_text", ""),
    ("translated_text", ""),
    ("target_lang", "en"),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

@st.cache_resource(show_spinner="Booting up 3-Step HTR Pipeline (CV + OCR + NLP)...")
def load_pipeline():
    """Construct the three pipeline stages and return them together.

    The function is wrapped in ``st.cache_resource``; the spinner text above
    is shown while it runs.

    Returns:
        A 3-tuple ``(preprocessor, engine, corrector)``: a
        ``DocumentPreprocessor``, an ``HTREngine`` created for English only,
        and an ``NLPCorrector`` created with ``use_ml=True``.
    """
    # Tuple elements are evaluated left to right, so the stages are still
    # built in preprocessor -> OCR engine -> corrector order.
    return (
        DocumentPreprocessor(),
        HTREngine(languages=['en']),
        NLPCorrector(use_ml=True),
    )

# Fetch the pipeline stages (served from the st.cache_resource cache).
preprocessor, engine, nlp_corrector = load_pipeline()

col1, col2 = st.columns(2)

# Image handed to the result column; stays None until the user supplies one.
target_image = None

# Width (px) every input image is scaled to so the cropper and the st.image
# preview render at the same size.
_DISPLAY_WIDTH = 700


def _load_display_image(source, display_width=_DISPLAY_WIDTH):
    """Open an uploaded/captured file as an upright RGB image of fixed width.

    Args:
        source: File-like object returned by ``st.file_uploader`` or
            ``st.camera_input``.
        display_width: Target width in pixels; the height is scaled by the
            same ratio.

    Returns:
        The resized ``PIL.Image.Image``, or ``None`` when the file cannot be
        decoded (an error message is shown to the user in that case).
    """
    # Local import: ImageOps is only needed here, and PIL is already a
    # dependency of this file.
    from PIL import ImageOps

    try:
        opened = Image.open(source)
        try:
            # Phone photos commonly store their rotation in the EXIF
            # orientation tag, which Image.open() does not apply; without
            # this step the handwriting can reach the OCR engine sideways.
            upright = ImageOps.exif_transpose(opened)
        except Exception:
            # Malformed EXIF must not block an otherwise readable image.
            upright = None
        if upright is None:
            upright = opened
        image = upright.convert("RGB")
    except (OSError, ValueError, Image.DecompressionBombError) as exc:
        # Corrupt or unsupported files previously crashed the whole script.
        st.error(f"Could not read that image: {exc}")
        return None

    if image.width != display_width:
        scale = display_width / float(image.width)
        # Clamp to 1px: extremely wide images would otherwise round down to a
        # zero height, which PIL rejects.
        scaled_height = max(1, int(image.height * scale))
        image = image.resize((display_width, scaled_height))
    return image


def _pick_region(image, checkbox_key, cropper_key, preview_caption):
    """Offer an optional crop of ``image`` and return the region to process.

    Args:
        image: The display-sized PIL image.
        checkbox_key: Streamlit widget key for the "Crop Image" checkbox.
        cropper_key: Streamlit widget key for the cropper component.
        preview_caption: Caption shown under the preview when not cropping.

    Returns:
        The cropper's output when cropping is enabled, otherwise ``image``.
    """
    if st.checkbox("✨ Crop Image", key=checkbox_key):
        st.markdown("✨ **Crop the word below:**")
        return st_cropper(image, realtime_update=True, box_color='#ff007f', key=cropper_key)
    st.image(image, caption=preview_caption)
    return image


with col1:
    st.markdown("### 📸 Please Upload an Image")
    input_method = st.radio("Choose Input Method", ["Upload Image", "Take a Photo"], horizontal=True)

    if input_method == "Upload Image":
        uploaded_file = st.file_uploader("Upload a handwritten word image", type=["png", "jpg", "jpeg", "heic", "webp"])
        if uploaded_file is not None:
            raw_image = _load_display_image(uploaded_file)
            if raw_image is not None:
                target_image = _pick_region(raw_image, "crop_upload", "upload_crop", "Uploaded Image")
    else:
        camera_photo = st.camera_input("Take a picture of a handwritten word")
        if camera_photo is not None:
            raw_image = _load_display_image(camera_photo)
            if raw_image is not None:
                target_image = _pick_region(raw_image, "crop_camera", "camera_crop", "Captured Image")

def _play_speech(text, lang, description):
    """Synthesize ``text`` with gTTS and embed an audio player for it.

    Args:
        text: Non-empty text to speak.
        lang: Language code passed to gTTS.
        description: Short label ("original text" / "translated text") used
            in the error message if synthesis fails.
    """
    try:
        audio = BytesIO()
        gTTS(text=text, lang=lang).write_to_fp(audio)
        st.audio(audio.getvalue(), format="audio/mp3")
    except Exception as exc:
        # Speech is best-effort: report the failure instead of aborting the page.
        st.error(f"Failed to generate audio for {description}: {exc}")


with col2:
    st.markdown("### 🪄 Magic Result")

    if target_image is not None:
        if st.button("✨ Extract Text"):
            with st.spinner("Please wait while extracting"):
                if engine is None:
                    st.error("Pipeline failed to initialize.")
                else:
                    # Each stream is best-effort so one failure cannot abort
                    # the other, but failures are collected and shown rather
                    # than silently turned into "no text found".
                    stream_failures = []

                    # --- STREAM A: RAW OCR (No Preprocessing) ---
                    try:
                        raw_ocr_output = engine.extract_text(np.array(target_image))
                        raw_stream_text = nlp_corrector.correct_spelling(raw_ocr_output)
                    except Exception as exc:
                        raw_stream_text = ""
                        stream_failures.append(f"Stream A (Raw Image) failed: {exc}")

                    # --- STREAM B: 3-STEP PIPELINE (Pre-Processed) ---
                    try:
                        # 1. Computer Vision Pre-Processing
                        cleaned_image_array = preprocessor.process(target_image)
                        # 2. Deep Learning OCR Engine
                        p_ocr_output = engine.extract_text(cleaned_image_array)
                        # 3. NLP Post-Processing
                        clean_stream_text = nlp_corrector.correct_spelling(p_ocr_output)
                    except Exception as exc:
                        clean_stream_text = ""
                        stream_failures.append(f"Stream B (Cleaned Image) failed: {exc}")

                    # --- THE ENSEMBLE JUDGE ---
                    # The judge picks between the two stream outputs; "or ''"
                    # guards the .strip() below in case it returns None.
                    extracted_text = nlp_corrector.judge_best_output(raw_stream_text, clean_stream_text) or ""

                    for failure in stream_failures:
                        st.warning(failure)

                    if not extracted_text.strip():
                        st.warning("Oops! I couldn't find any text. Try a clearer image.")
                        extracted_text = ""
                    else:
                        st.success("Ensemble Magic! Winner selected from Dual-Stream analysis.")
                        # The expander previously had an empty label, leaving
                        # an unlabeled toggle on the page.
                        with st.expander("Show both stream outputs"):
                            st.write(f"**Stream A (Raw Image):** {raw_stream_text}")
                            st.write(f"**Stream B (Cleaned Image):** {clean_stream_text}")

                    st.session_state["extracted_text"] = extracted_text
                    # A new extraction invalidates any earlier translation.
                    st.session_state["translated_text"] = ""

        # Editable original text
        st.session_state["extracted_text"] = st.text_area(
            "You can edit the result here:",
            value=st.session_state.get("extracted_text", ""),
            height=150,
        )

        st.markdown("### 🌐 Translation & Voice")

        # Language selection
        lang_keys = sorted(LANGUAGES.keys())
        saved_lang = st.session_state.get("target_lang", "en")
        # Fall back to the first entry rather than raising if the stored code
        # is not in the language table.
        default_index = lang_keys.index(saved_lang) if saved_lang in lang_keys else 0
        target_lang = st.selectbox(
            "Choose target language",
            options=lang_keys,
            index=default_index,
            format_func=lambda k: LANGUAGES[k].title(),
        )
        st.session_state["target_lang"] = target_lang

        with st.expander("Show available languages"):
            st.write(", ".join(f"{code} – {name.title()}" for code, name in LANGUAGES.items()))

        col_translate, col_speak = st.columns(2)

        with col_translate:
            if st.button("🌍 Translate into other language"):
                if st.session_state["extracted_text"].strip():
                    try:
                        result = translator.translate(
                            st.session_state["extracted_text"],
                            dest=target_lang,
                        )
                        st.session_state["translated_text"] = result.text
                        # Remember which language this translation is in so
                        # speech uses it even if the selectbox changes later.
                        st.session_state["translated_lang"] = target_lang
                    except Exception as e:
                        st.error(f"Translation failed: {e}")
                else:
                    st.warning("Please extract or type some text first.")

        with col_speak:
            if st.button("🔊 Speak text (original & translated)"):
                original = st.session_state.get("extracted_text", "").strip()
                translated = st.session_state.get("translated_text", "").strip()

                if not original and not translated:
                    st.warning("Nothing to speak. Please extract or translate text first.")
                else:
                    # Speak original (English assumed)
                    if original:
                        _play_speech(original, "en", "original text")

                    # Speak translated, in the language it was translated to
                    # (not whatever the selectbox currently shows).
                    if translated:
                        spoken_lang = st.session_state.get("translated_lang", target_lang)
                        _play_speech(translated, spoken_lang, "translated text")

        if st.session_state.get("translated_text", "").strip():
            st.text_area(
                "Translated text:",
                value=st.session_state["translated_text"],
                height=150,
            )

    else:
        st.info("Waiting for an image to work my magic...")