# Vision / app.py
# Source: Eli-Iustus's Hugging Face Space — "Update app.py" (commit a8bb7e9, verified)
import streamlit as st
import torch
import torchvision.transforms as transforms
from PIL import Image
from pillow_heif import register_heif_opener
import numpy as np
import os
from io import BytesIO
from googletrans import Translator, LANGUAGES
from gtts import gTTS
# Register HEIC/HEIF support so PIL's Image.open can read iPhone photos.
register_heif_opener()
from streamlit_cropper import st_cropper
import easyocr
# st.set_page_config must be the first Streamlit command executed on the page.
st.set_page_config(page_title="INK VISION", page_icon="✨", layout="wide")
# Custom CSS for the animated gradient background and glassmorphic UI,
# plus the page header itself. Rendered as raw HTML (unsafe_allow_html=True);
# the CSS string is part of the app's runtime output and must not be altered.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@500;700&family=Poppins:wght@300;400;600&display=swap');
/* Animated Gradient Background */
.stApp {
background: linear-gradient(-45deg, #ee7752, #e73c7e, #23a6d5, #23d5ab);
background-size: 400% 400%;
animation: gradientBG 15s ease infinite;
font-family: 'Poppins', sans-serif;
}
@keyframes gradientBG {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
/* Base text color to white for contrast against dark/bright backgrounds */
h1, h2, h3, p, label {
color: #ffffff !important;
text-shadow: 1px 1px 4px rgba(0,0,0,0.4);
}
/* Glassmorphism wrapper for header */
.glass-container {
background: rgba(255, 255, 255, 0.1);
border-radius: 16px;
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
backdrop-filter: blur(8.5px);
-webkit-backdrop-filter: blur(8.5px);
border: 1px solid rgba(255, 255, 255, 0.18);
padding: 2rem;
margin-top: 1rem;
margin-bottom: 2rem;
}
/* Fancy Header Font */
h1 {
font-family: 'Orbitron', sans-serif !important;
font-size: 3rem !important;
text-align: center;
background: -webkit-linear-gradient(#fff, #f0f0f0);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
/* Stylish buttons */
div.stButton > button:first-child {
background: linear-gradient(90deg, #ff007f 0%, #7928ca 100%);
color: white;
border: none;
border-radius: 50px;
padding: 10px 24px;
font-weight: 600;
font-size: 1.1rem;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 15px rgba(0,0,0,0.2);
}
div.stButton > button:first-child:hover {
transform: translateY(-2px);
box-shadow: 0 6px 20px rgba(0,0,0,0.3);
background: linear-gradient(90deg, #7928ca 0%, #ff007f 100%);
color: #ffffff !important;
}
/* File Uploader styling */
.stFileUploader > div > div {
background: rgba(255, 255, 255, 0.05);
border: 2px dashed rgba(255, 255, 255, 0.5);
border-radius: 10px;
}
/* Text area styling */
.stTextArea textarea {
background-color: rgba(255, 255, 255, 0.9) !important;
color: #333333 !important;
font-size: 1.5rem !important;
font-weight: 600 !important;
font-family: 'Poppins', sans-serif !important;
border-radius: 10px !important;
border: 2px solid transparent !important;
}
.stTextArea textarea:focus {
border-color: #ff007f !important;
box-shadow: 0 0 10px rgba(255,0,127,0.5) !important;
}
</style>
<div class="glass-container">
<h1>✨ Handwritten Text Recognition System for Document Digitalization ✨</h1>
<p style="text-align: center; font-size: 1.2rem;">..H..T..R..</p>
</div>
""", unsafe_allow_html=True)
# Project-local pipeline stages: CV cleanup -> OCR -> NLP correction.
from pipeline.preprocessor import DocumentPreprocessor
from pipeline.ocr_engine import HTREngine
from pipeline.postprocessor import NLPCorrector
# Initialise the googletrans client once; reused by the Translate button below.
translator = Translator()
# Seed the session-state slots on first run so later reads never miss a key.
for _key, _default in (
    ("extracted_text", ""),
    ("translated_text", ""),
    ("target_lang", "en"),
):
    st.session_state.setdefault(_key, _default)
@st.cache_resource(show_spinner="Booting up 3-Step HTR Pipeline (CV + OCR + NLP)...")
def load_pipeline():
    """Build the three pipeline stages once and cache them for the whole session.

    Returns a (preprocessor, engine, corrector) triple: computer-vision
    cleanup, the OCR engine (English), and the ML-backed NLP corrector.
    """
    doc_preprocessor = DocumentPreprocessor()
    htr_engine = HTREngine(languages=['en'])
    corrector = NLPCorrector(use_ml=True)
    return doc_preprocessor, htr_engine, corrector


preprocessor, engine, nlp_corrector = load_pipeline()
# Standard display width so the cropper and st.image previews line up.
_DISPLAY_WIDTH = 700


def _prepare_image(file_obj, crop_checkbox_key, cropper_key, caption):
    """Open an uploaded/captured image, normalise width, optionally crop.

    Shared by the "Upload Image" and "Take a Photo" branches, which
    previously duplicated this logic verbatim.

    Args:
        file_obj: file-like object from st.file_uploader / st.camera_input.
        crop_checkbox_key: widget key for the "Crop Image" checkbox.
        cropper_key: widget key for the st_cropper widget.
        caption: caption shown under the preview image.

    Returns:
        The (possibly cropped) PIL image to feed into the OCR pipeline.
    """
    image = Image.open(file_obj).convert("RGB")
    # Resize to a standard width so both cropper and st.image match in size.
    if image.width != _DISPLAY_WIDTH:
        ratio = _DISPLAY_WIDTH / float(image.width)
        image = image.resize((_DISPLAY_WIDTH, int(image.height * ratio)))
    if st.checkbox("✨ Crop Image", key=crop_checkbox_key):
        st.markdown("✨ **Crop the word below:**")
        result = st_cropper(image, realtime_update=True, box_color='#ff007f', key=cropper_key)
    else:
        result = image
    # NOTE(review): original paste's indentation was lost; preview is shown for
    # both branches here — confirm the cropped case should also render st.image.
    st.image(result, caption=caption)
    return result


col1, col2 = st.columns(2)
target_image = None
with col1:
    st.markdown("### 📸 Please Upload an Image")
    input_method = st.radio("Choose Input Method", ["Upload Image", "Take a Photo"], horizontal=True)
    if input_method == "Upload Image":
        uploaded_file = st.file_uploader("Upload a handwritten word image", type=["png", "jpg", "jpeg", "heic", "webp"])
        if uploaded_file is not None:
            target_image = _prepare_image(uploaded_file, "crop_upload", "upload_crop", "Uploaded Image")
    else:
        camera_photo = st.camera_input("Take a picture of a handwritten word")
        if camera_photo is not None:
            target_image = _prepare_image(camera_photo, "crop_camera", "camera_crop", "Captured Image")
with col2:
    st.markdown("### 🪄 Magic Result")
    if target_image is not None:
        if st.button("✨ Extract Text"):
            with st.spinner("Please wait while extracting"):
                if engine is None:
                    st.error("Pipeline failed to initialize.")
                else:
                    # --- STREAM A: raw OCR, no preprocessing ---
                    try:
                        raw_ocr_output = engine.extract_text(np.array(target_image))
                        raw_stream_text = nlp_corrector.correct_spelling(raw_ocr_output)
                    except Exception:
                        raw_stream_text = ""  # best-effort: an empty stream just loses the vote
                    # --- STREAM B: full 3-step pipeline (pre-processed) ---
                    try:
                        # 1. Computer-vision pre-processing
                        cleaned_image_array = preprocessor.process(target_image)
                        # 2. Deep-learning OCR engine
                        p_ocr_output = engine.extract_text(cleaned_image_array)
                        # 3. NLP post-processing
                        clean_stream_text = nlp_corrector.correct_spelling(p_ocr_output)
                    except Exception:
                        clean_stream_text = ""
                    # --- ENSEMBLE JUDGE: pick whichever stream reads most like real English ---
                    extracted_text = nlp_corrector.judge_best_output(raw_stream_text, clean_stream_text)
                    if extracted_text.strip() == "":
                        st.warning("Oops! I couldn't find any text. Try a clearer image.")
                        extracted_text = ""
                    else:
                        st.success("Ensemble Magic! Winner selected from Dual-Stream analysis.")
                        # FIX: st.expander("") used an empty label, which recent
                        # Streamlit releases reject — give it a descriptive label.
                        with st.expander("🔍 Dual-Stream details"):
                            st.write(f"**Stream A (Raw Image):** {raw_stream_text}")
                            st.write(f"**Stream B (Cleaned Image):** {clean_stream_text}")
                    st.session_state["extracted_text"] = extracted_text
                    # Clear any stale translation: it no longer matches the new text.
                    st.session_state["translated_text"] = ""
        # Editable original text — the widget's return value is written straight
        # back into session state so edits survive reruns.
        st.session_state["extracted_text"] = st.text_area(
            "You can edit the result here:",
            value=st.session_state.get("extracted_text", ""),
            height=150,
        )
        st.markdown("### 🌐 Translation & Voice")
        # Language selection. FIX: fall back to English if an unknown/stale code
        # is stored, so lang_keys.index() cannot raise ValueError.
        lang_keys = sorted(LANGUAGES.keys())
        saved_lang = st.session_state.get("target_lang", "en")
        default_index = lang_keys.index(saved_lang if saved_lang in lang_keys else "en")
        target_lang = st.selectbox(
            "Choose target language",
            options=lang_keys,
            index=default_index,
            format_func=lambda k: LANGUAGES[k].title(),
        )
        st.session_state["target_lang"] = target_lang
        with st.expander("Show available languages"):
            st.write(", ".join(f"{code} – {name.title()}" for code, name in LANGUAGES.items()))
        col_translate, col_speak = st.columns(2)
        with col_translate:
            if st.button("🌍 Translate into other language"):
                if st.session_state["extracted_text"].strip():
                    try:
                        result = translator.translate(
                            st.session_state["extracted_text"],
                            dest=target_lang,
                        )
                        st.session_state["translated_text"] = result.text
                    except Exception as e:
                        st.error(f"Translation failed: {e}")
                else:
                    st.warning("Please extract or type some text first.")
        with col_speak:
            if st.button("🔊 Speak text (original & translated)"):
                original = st.session_state.get("extracted_text", "").strip()
                translated = st.session_state.get("translated_text", "").strip()
                if not original and not translated:
                    st.warning("Nothing to speak. Please extract or translate text first.")
                else:
                    # Speak original — assumes source text is English; TODO confirm.
                    if original:
                        try:
                            buf = BytesIO()
                            gTTS(text=original, lang="en").write_to_fp(buf)
                            buf.seek(0)
                            st.audio(buf.read(), format="audio/mp3")
                        except Exception as e:
                            st.error(f"Failed to generate audio for original text: {e}")
                    # Speak the translation in the selected target language.
                    if translated:
                        try:
                            buf_tr = BytesIO()
                            gTTS(text=translated, lang=target_lang).write_to_fp(buf_tr)
                            buf_tr.seek(0)
                            st.audio(buf_tr.read(), format="audio/mp3")
                        except Exception as e:
                            st.error(f"Failed to generate audio for translated text: {e}")
        if st.session_state.get("translated_text", "").strip():
            st.text_area(
                "Translated text:",
                value=st.session_state["translated_text"],
                height=150,
            )
    else:
        st.info("Waiting for an image to work my magic...")