import streamlit as st
import torch
import torchvision.transforms as transforms
from PIL import Image
from pillow_heif import register_heif_opener
import numpy as np
import os
from io import BytesIO
from googletrans import Translator, LANGUAGES
from gtts import gTTS
# Register HEIC support for PIL
register_heif_opener()
from streamlit_cropper import st_cropper
import easyocr
st.set_page_config(page_title="INK VISION", page_icon="✨", layout="wide")

# App header banner. NOTE(review): the markup below is plain text, not CSS —
# rendered with unsafe_allow_html in case HTML is added later.
_HEADER_MARKUP = """
✨ Handwritten Text Recognition System for Document Digitalization ✨
..H..T..R..
"""
st.markdown(_HEADER_MARKUP, unsafe_allow_html=True)
from pipeline.preprocessor import DocumentPreprocessor
from pipeline.ocr_engine import HTREngine
from pipeline.postprocessor import NLPCorrector

# One shared translator instance for the whole session.
translator = Translator()

# Seed session-state keys with their defaults on first run only;
# setdefault leaves existing values untouched across reruns.
for _key, _default in (
    ("extracted_text", ""),
    ("translated_text", ""),
    ("target_lang", "en"),
):
    st.session_state.setdefault(_key, _default)
@st.cache_resource(show_spinner="Booting up 3-Step HTR Pipeline (CV + OCR + NLP)...")
def load_pipeline():
    """Build and cache the three pipeline stages.

    Returns a tuple ``(preprocessor, ocr_engine, nlp_corrector)``; cached
    with ``st.cache_resource`` so the models are constructed only once
    per server process.
    """
    doc_preprocessor = DocumentPreprocessor()
    ocr_engine = HTREngine(languages=['en'])
    corrector = NLPCorrector(use_ml=True)
    return doc_preprocessor, ocr_engine, corrector


preprocessor, engine, nlp_corrector = load_pipeline()
col1, col2 = st.columns(2)
target_image = None


def _prepare_display_image(image_file):
    """Open *image_file* as an RGB PIL image scaled to a fixed 700px width.

    Resizing to a standard width keeps the cropper canvas and the
    ``st.image`` preview the same size regardless of the original
    resolution (aspect ratio is preserved).
    """
    image = Image.open(image_file).convert("RGB")
    target_width = 700
    if image.width != target_width:
        ratio = target_width / float(image.width)
        image = image.resize((target_width, int(image.height * ratio)))
    return image


def _select_region(raw_image, checkbox_key, cropper_key, caption):
    """Let the user optionally crop *raw_image*; display and return the result.

    ``checkbox_key``/``cropper_key`` keep the upload and camera widgets
    from colliding in Streamlit's widget-state registry.
    """
    if st.checkbox("✨ Crop Image", key=checkbox_key):
        st.markdown("✨ **Crop the word below:**")
        selected = st_cropper(raw_image, realtime_update=True, box_color='#ff007f', key=cropper_key)
    else:
        selected = raw_image
    st.image(selected, caption=caption)
    return selected


with col1:
    st.markdown("### 📸 Please Upload an Image")
    input_method = st.radio("Choose Input Method", ["Upload Image", "Take a Photo"], horizontal=True)
    if input_method == "Upload Image":
        uploaded_file = st.file_uploader("Upload a handwritten word image", type=["png", "jpg", "jpeg", "heic", "webp"])
        if uploaded_file is not None:
            raw_image = _prepare_display_image(uploaded_file)
            target_image = _select_region(raw_image, "crop_upload", "upload_crop", "Uploaded Image")
    else:
        camera_photo = st.camera_input("Take a picture of a handwritten word")
        if camera_photo is not None:
            raw_image = _prepare_display_image(camera_photo)
            target_image = _select_region(raw_image, "crop_camera", "camera_crop", "Captured Image")
def _text_to_speech(text, lang, label):
    """Synthesize *text* with gTTS and play it inline; show errors in the UI.

    ``label`` ("original" / "translated") only feeds the error message so
    the user knows which audio stream failed.
    """
    try:
        audio_buffer = BytesIO()
        gTTS(text=text, lang=lang).write_to_fp(audio_buffer)
        audio_buffer.seek(0)
        st.audio(audio_buffer.read(), format="audio/mp3")
    except Exception as e:
        st.error(f"Failed to generate audio for {label} text: {e}")


with col2:
    st.markdown("### 🪄 Magic Result")
    extracted_text = st.session_state.get("extracted_text", "")
    translated_text = st.session_state.get("translated_text", "")
    if target_image is not None:
        if st.button("✨ Extract Text"):
            with st.spinner("Please wait while extracting"):
                if engine is None:
                    st.error("Pipeline failed to initialize.")
                else:
                    # --- STREAM A: RAW OCR (No Preprocessing) ---
                    # Best-effort: any failure in one stream leaves it empty
                    # so the other stream can still win the ensemble.
                    try:
                        raw_ocr_output = engine.extract_text(np.array(target_image))
                        raw_stream_text = nlp_corrector.correct_spelling(raw_ocr_output)
                    except Exception:
                        raw_stream_text = ""
                    # --- STREAM B: 3-STEP PIPELINE (Pre-Processed) ---
                    try:
                        # 1. Computer Vision Pre-Processing
                        cleaned_image_array = preprocessor.process(target_image)
                        # 2. Deep Learning OCR Engine
                        p_ocr_output = engine.extract_text(cleaned_image_array)
                        # 3. NLP Post-Processing
                        clean_stream_text = nlp_corrector.correct_spelling(p_ocr_output)
                    except Exception:
                        clean_stream_text = ""
                    # --- THE ENSEMBLE JUDGE ---
                    # The judge picks the version that sounds most like real English.
                    extracted_text = nlp_corrector.judge_best_output(raw_stream_text, clean_stream_text)
                    if extracted_text.strip() == "":
                        st.warning("Oops! I couldn't find any text. Try a clearer image.")
                        extracted_text = ""
                    else:
                        st.success("Ensemble Magic! Winner selected from Dual-Stream analysis.")
                        with st.expander(""):
                            st.write(f"**Stream A (Raw Image):** {raw_stream_text}")
                            st.write(f"**Stream B (Cleaned Image):** {clean_stream_text}")
                    # Persist the winner and invalidate any stale translation.
                    st.session_state["extracted_text"] = extracted_text
                    st.session_state["translated_text"] = ""
        # Editable original text (edits flow back into session state).
        st.session_state["extracted_text"] = st.text_area(
            "You can edit the result here:",
            value=st.session_state.get("extracted_text", ""),
            height=150,
        )
        st.markdown("### 🌐 Translation & Voice")
        # Language selection. Guard against a stored code that is no longer
        # a valid googletrans language (``list.index`` would raise ValueError
        # and crash the whole app) by falling back to English.
        lang_keys = sorted(LANGUAGES.keys())
        stored_lang = st.session_state.get("target_lang", "en")
        if stored_lang not in lang_keys:
            stored_lang = "en"
        default_index = lang_keys.index(stored_lang)
        target_lang = st.selectbox(
            "Choose target language",
            options=lang_keys,
            index=default_index,
            format_func=lambda k: LANGUAGES[k].title(),
        )
        st.session_state["target_lang"] = target_lang
        with st.expander("Show available languages"):
            st.write(", ".join(f"{code} – {name.title()}" for code, name in LANGUAGES.items()))
        col_translate, col_speak = st.columns(2)
        with col_translate:
            if st.button("🌍 Translate into other language"):
                if st.session_state["extracted_text"].strip():
                    try:
                        result = translator.translate(
                            st.session_state["extracted_text"],
                            dest=target_lang,
                        )
                        st.session_state["translated_text"] = result.text
                    except Exception as e:
                        st.error(f"Translation failed: {e}")
                else:
                    st.warning("Please extract or type some text first.")
        with col_speak:
            if st.button("🔊 Speak text (original & translated)"):
                original = st.session_state.get("extracted_text", "").strip()
                translated = st.session_state.get("translated_text", "").strip()
                if not original and not translated:
                    st.warning("Nothing to speak. Please extract or translate text first.")
                else:
                    # Original is assumed to be English — TODO confirm; the
                    # OCR engine is configured with languages=['en'].
                    if original:
                        _text_to_speech(original, "en", "original")
                    if translated:
                        _text_to_speech(translated, target_lang, "translated")
        if st.session_state.get("translated_text", "").strip():
            st.text_area(
                "Translated text:",
                value=st.session_state["translated_text"],
                height=150,
            )
    else:
        st.info("Waiting for an image to work my magic...")