# Vision / app.py
# Source: Eli-Iustus's Hugging Face Space — "Update app.py" (commit a8bb7e9, verified)
import streamlit as st
import torch
import torchvision.transforms as transforms
from PIL import Image
from pillow_heif import register_heif_opener
import numpy as np
import os
from io import BytesIO
from googletrans import Translator, LANGUAGES
from gtts import gTTS
# Register HEIC/HEIF support so PIL's Image.open can read iPhone photos.
register_heif_opener()
from streamlit_cropper import st_cropper
import easyocr
# st.set_page_config must be the first Streamlit command executed on the page.
st.set_page_config(page_title="INK VISION", page_icon="✨", layout="wide")
# Custom CSS for the animated gradient background and glassmorphic UI,
# plus the page header itself. Rendered as raw HTML (unsafe_allow_html=True);
# the CSS string is part of the app's runtime output and must not be altered.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@500;700&family=Poppins:wght@300;400;600&display=swap');
/* Animated Gradient Background */
.stApp {
background: linear-gradient(-45deg, #ee7752, #e73c7e, #23a6d5, #23d5ab);
background-size: 400% 400%;
animation: gradientBG 15s ease infinite;
font-family: 'Poppins', sans-serif;
}
@keyframes gradientBG {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
/* Base text color to white for contrast against dark/bright backgrounds */
h1, h2, h3, p, label {
color: #ffffff !important;
text-shadow: 1px 1px 4px rgba(0,0,0,0.4);
}
/* Glassmorphism wrapper for header */
.glass-container {
background: rgba(255, 255, 255, 0.1);
border-radius: 16px;
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
backdrop-filter: blur(8.5px);
-webkit-backdrop-filter: blur(8.5px);
border: 1px solid rgba(255, 255, 255, 0.18);
padding: 2rem;
margin-top: 1rem;
margin-bottom: 2rem;
}
/* Fancy Header Font */
h1 {
font-family: 'Orbitron', sans-serif !important;
font-size: 3rem !important;
text-align: center;
background: -webkit-linear-gradient(#fff, #f0f0f0);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
/* Stylish buttons */
div.stButton > button:first-child {
background: linear-gradient(90deg, #ff007f 0%, #7928ca 100%);
color: white;
border: none;
border-radius: 50px;
padding: 10px 24px;
font-weight: 600;
font-size: 1.1rem;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 15px rgba(0,0,0,0.2);
}
div.stButton > button:first-child:hover {
transform: translateY(-2px);
box-shadow: 0 6px 20px rgba(0,0,0,0.3);
background: linear-gradient(90deg, #7928ca 0%, #ff007f 100%);
color: #ffffff !important;
}
/* File Uploader styling */
.stFileUploader > div > div {
background: rgba(255, 255, 255, 0.05);
border: 2px dashed rgba(255, 255, 255, 0.5);
border-radius: 10px;
}
/* Text area styling */
.stTextArea textarea {
background-color: rgba(255, 255, 255, 0.9) !important;
color: #333333 !important;
font-size: 1.5rem !important;
font-weight: 600 !important;
font-family: 'Poppins', sans-serif !important;
border-radius: 10px !important;
border: 2px solid transparent !important;
}
.stTextArea textarea:focus {
border-color: #ff007f !important;
box-shadow: 0 0 10px rgba(255,0,127,0.5) !important;
}
</style>
<div class="glass-container">
<h1>✨ Handwritten Text Recognition System for Document Digitalization ✨</h1>
<p style="text-align: center; font-size: 1.2rem;">..H..T..R..</p>
</div>
""", unsafe_allow_html=True)
# Project-local pipeline stages: CV cleanup -> OCR -> NLP correction.
from pipeline.preprocessor import DocumentPreprocessor
from pipeline.ocr_engine import HTREngine
from pipeline.postprocessor import NLPCorrector
# Initialise the googletrans client once; reused by the Translate button below.
translator = Translator()
# Seed the session-state slots on first run so later reads never miss a key.
for _key, _default in (
    ("extracted_text", ""),
    ("translated_text", ""),
    ("target_lang", "en"),
):
    st.session_state.setdefault(_key, _default)
@st.cache_resource(show_spinner="Booting up 3-Step HTR Pipeline (CV + OCR + NLP)...")
def load_pipeline():
    """Build the three pipeline stages once and cache them for the whole session.

    Returns a (preprocessor, engine, corrector) triple: computer-vision
    cleanup, the OCR engine (English), and the ML-backed NLP corrector.
    """
    doc_preprocessor = DocumentPreprocessor()
    htr_engine = HTREngine(languages=['en'])
    corrector = NLPCorrector(use_ml=True)
    return doc_preprocessor, htr_engine, corrector


preprocessor, engine, nlp_corrector = load_pipeline()
# Standard display width so the cropper and st.image previews line up.
_DISPLAY_WIDTH = 700


def _prepare_image(file_obj, crop_checkbox_key, cropper_key, caption):
    """Open an uploaded/captured image, normalise width, optionally crop.

    Shared by the "Upload Image" and "Take a Photo" branches, which
    previously duplicated this logic verbatim.

    Args:
        file_obj: file-like object from st.file_uploader / st.camera_input.
        crop_checkbox_key: widget key for the "Crop Image" checkbox.
        cropper_key: widget key for the st_cropper widget.
        caption: caption shown under the preview image.

    Returns:
        The (possibly cropped) PIL image to feed into the OCR pipeline.
    """
    image = Image.open(file_obj).convert("RGB")
    # Resize to a standard width so both cropper and st.image match in size.
    if image.width != _DISPLAY_WIDTH:
        ratio = _DISPLAY_WIDTH / float(image.width)
        image = image.resize((_DISPLAY_WIDTH, int(image.height * ratio)))
    if st.checkbox("✨ Crop Image", key=crop_checkbox_key):
        st.markdown("✨ **Crop the word below:**")
        result = st_cropper(image, realtime_update=True, box_color='#ff007f', key=cropper_key)
    else:
        result = image
    # NOTE(review): original paste's indentation was lost; preview is shown for
    # both branches here — confirm the cropped case should also render st.image.
    st.image(result, caption=caption)
    return result


col1, col2 = st.columns(2)
target_image = None
with col1:
    st.markdown("### 📸 Please Upload an Image")
    input_method = st.radio("Choose Input Method", ["Upload Image", "Take a Photo"], horizontal=True)
    if input_method == "Upload Image":
        uploaded_file = st.file_uploader("Upload a handwritten word image", type=["png", "jpg", "jpeg", "heic", "webp"])
        if uploaded_file is not None:
            target_image = _prepare_image(uploaded_file, "crop_upload", "upload_crop", "Uploaded Image")
    else:
        camera_photo = st.camera_input("Take a picture of a handwritten word")
        if camera_photo is not None:
            target_image = _prepare_image(camera_photo, "crop_camera", "camera_crop", "Captured Image")
with col2:
    st.markdown("### 🪄 Magic Result")
    if target_image is not None:
        if st.button("✨ Extract Text"):
            with st.spinner("Please wait while extracting"):
                if engine is None:
                    st.error("Pipeline failed to initialize.")
                else:
                    # --- STREAM A: raw OCR, no preprocessing ---
                    try:
                        raw_ocr_output = engine.extract_text(np.array(target_image))
                        raw_stream_text = nlp_corrector.correct_spelling(raw_ocr_output)
                    except Exception:
                        raw_stream_text = ""  # best-effort: an empty stream just loses the vote
                    # --- STREAM B: full 3-step pipeline (pre-processed) ---
                    try:
                        # 1. Computer-vision pre-processing
                        cleaned_image_array = preprocessor.process(target_image)
                        # 2. Deep-learning OCR engine
                        p_ocr_output = engine.extract_text(cleaned_image_array)
                        # 3. NLP post-processing
                        clean_stream_text = nlp_corrector.correct_spelling(p_ocr_output)
                    except Exception:
                        clean_stream_text = ""
                    # --- ENSEMBLE JUDGE: pick whichever stream reads most like real English ---
                    extracted_text = nlp_corrector.judge_best_output(raw_stream_text, clean_stream_text)
                    if extracted_text.strip() == "":
                        st.warning("Oops! I couldn't find any text. Try a clearer image.")
                        extracted_text = ""
                    else:
                        st.success("Ensemble Magic! Winner selected from Dual-Stream analysis.")
                        # FIX: st.expander("") used an empty label, which recent
                        # Streamlit releases reject — give it a descriptive label.
                        with st.expander("🔍 Dual-Stream details"):
                            st.write(f"**Stream A (Raw Image):** {raw_stream_text}")
                            st.write(f"**Stream B (Cleaned Image):** {clean_stream_text}")
                    st.session_state["extracted_text"] = extracted_text
                    # Clear any stale translation: it no longer matches the new text.
                    st.session_state["translated_text"] = ""
        # Editable original text — the widget's return value is written straight
        # back into session state so edits survive reruns.
        st.session_state["extracted_text"] = st.text_area(
            "You can edit the result here:",
            value=st.session_state.get("extracted_text", ""),
            height=150,
        )
        st.markdown("### 🌐 Translation & Voice")
        # Language selection. FIX: fall back to English if an unknown/stale code
        # is stored, so lang_keys.index() cannot raise ValueError.
        lang_keys = sorted(LANGUAGES.keys())
        saved_lang = st.session_state.get("target_lang", "en")
        default_index = lang_keys.index(saved_lang if saved_lang in lang_keys else "en")
        target_lang = st.selectbox(
            "Choose target language",
            options=lang_keys,
            index=default_index,
            format_func=lambda k: LANGUAGES[k].title(),
        )
        st.session_state["target_lang"] = target_lang
        with st.expander("Show available languages"):
            st.write(", ".join(f"{code} – {name.title()}" for code, name in LANGUAGES.items()))
        col_translate, col_speak = st.columns(2)
        with col_translate:
            if st.button("🌍 Translate into other language"):
                if st.session_state["extracted_text"].strip():
                    try:
                        result = translator.translate(
                            st.session_state["extracted_text"],
                            dest=target_lang,
                        )
                        st.session_state["translated_text"] = result.text
                    except Exception as e:
                        st.error(f"Translation failed: {e}")
                else:
                    st.warning("Please extract or type some text first.")
        with col_speak:
            if st.button("🔊 Speak text (original & translated)"):
                original = st.session_state.get("extracted_text", "").strip()
                translated = st.session_state.get("translated_text", "").strip()
                if not original and not translated:
                    st.warning("Nothing to speak. Please extract or translate text first.")
                else:
                    # Speak original — assumes source text is English; TODO confirm.
                    if original:
                        try:
                            buf = BytesIO()
                            gTTS(text=original, lang="en").write_to_fp(buf)
                            buf.seek(0)
                            st.audio(buf.read(), format="audio/mp3")
                        except Exception as e:
                            st.error(f"Failed to generate audio for original text: {e}")
                    # Speak the translation in the selected target language.
                    if translated:
                        try:
                            buf_tr = BytesIO()
                            gTTS(text=translated, lang=target_lang).write_to_fp(buf_tr)
                            buf_tr.seek(0)
                            st.audio(buf_tr.read(), format="audio/mp3")
                        except Exception as e:
                            st.error(f"Failed to generate audio for translated text: {e}")
        if st.session_state.get("translated_text", "").strip():
            st.text_area(
                "Translated text:",
                value=st.session_state["translated_text"],
                height=150,
            )
    else:
        st.info("Waiting for an image to work my magic...")