# tagin / app.py
# Author: repleeka — "Update app.py" (commit a25d063, verified)
# import streamlit as st
# import pandas as pd
# import time
# from datetime import datetime
# import os
# import json
# # Try to import Google Sheets libraries
# try:
# import gspread
# from oauth2client.service_account import ServiceAccountCredentials
# GSHEETS_AVAILABLE = True
# except ImportError:
# GSHEETS_AVAILABLE = False
# # --- CONFIGURATION ---
# st.set_page_config(
# page_title="Tagin Feedback Loop",
# page_icon="๐Ÿ“",
# layout="centered",
# initial_sidebar_state="expanded"
# )
# # --- SESSION STATE INITIALIZATION ---
# if "translations_list" not in st.session_state:
# st.session_state.translations_list = []
# if "source_text" not in st.session_state:
# st.session_state.source_text = ""
# # --- MODEL LOADING LOGIC ---
# @st.cache_resource
# def load_model(model_path):
# """
# Loads the mBART model and tokenizer from a local directory or HuggingFace Hub.
# """
# try:
# from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# import torch
# tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
# model = MBartForConditionalGeneration.from_pretrained(model_path)
# return tokenizer, model, None
# except Exception as e:
# return None, None, str(e)
# def perform_translation_beam(text, source_lang, target_lang, model, tokenizer, num_beams=5):
# """
# Translates text returning top N hypotheses using beam search.
# """
# if not text:
# return []
# lang_map = {
# "English": "en_XX",
# "Tagin": "<tgj_IN>"
# }
# src_code = lang_map.get(source_lang)
# tgt_code = lang_map.get(target_lang)
# try:
# tokenizer.src_lang = src_code
# encoded_input = tokenizer(text, return_tensors="pt")
# if tgt_code in tokenizer.lang_code_to_id:
# forced_bos_id = tokenizer.lang_code_to_id[tgt_code]
# else:
# forced_bos_id = tokenizer.convert_tokens_to_ids(tgt_code)
# generated_tokens = model.generate(
# **encoded_input,
# forced_bos_token_id=forced_bos_id,
# num_beams=num_beams,
# num_return_sequences=num_beams,
# max_length=128
# )
# translations = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# return translations
# except Exception as e:
# st.error(f"Translation Error: {str(e)}")
# return []
# def save_to_gsheet(data_row, creds_dict, sheet_name):
# """Saves data to Google Sheets."""
# if not GSHEETS_AVAILABLE:
# return False, "Libraries 'gspread' and 'oauth2client' not installed."
# try:
# # Define scope
# scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
# # Authenticate using the dictionary (from secrets or file)
# creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
# client = gspread.authorize(creds)
# # Open the sheet
# try:
# sheet = client.open(sheet_name).sheet1
# except gspread.SpreadsheetNotFound:
# return False, f"Spreadsheet '{sheet_name}' not found. Please share it with the service account email."
# # Check if header exists (simple check: is cell A1 empty?)
# if not sheet.cell(1, 1).value:
# sheet.append_row(['timestamp', 'source_lang', 'target_lang', 'source_text', 'corrected_translation'])
# # Append data
# sheet.append_row(data_row)
# return True, f"Saved to Google Sheet '{sheet_name}'"
# except Exception as e:
# return False, str(e)
# # --- SIDEBAR ---
# with st.sidebar:
# st.title("โš™๏ธ Configuration")
# # Fixed Model Path
# st.markdown("**Active Model:**")
# model_path_input = "Repleeka/mBART-tgj-final"
# st.code(model_path_input, language=None)
# st.divider()
# st.subheader("๐Ÿ’พ Storage Settings")
# st.caption("All corrections are saved to Google Sheets.")
# gsheet_creds = None
# # Check dependencies
# if not GSHEETS_AVAILABLE:
# st.error("โš ๏ธ Install gspread: `pip install gspread oauth2client`")
# # Fixed Google Sheet Name
# gsheet_name = "GinLish_Corpus_BOT"
# st.markdown("**Target Database:**")
# st.info(f"๐Ÿ“„ {gsheet_name}")
# # SECURITY UPDATE: Check for Secrets first (HuggingFace Spaces / Streamlit Cloud)
# # We look for a secret named "GSHEET_CREDENTIALS" containing the JSON string
# if "GSHEET_CREDENTIALS" in os.environ:
# try:
# gsheet_creds = json.loads(os.environ["GSHEET_CREDENTIALS"])
# st.success("โœ… Credentials loaded from Environment Secrets")
# except Exception as e:
# st.error(f"Error loading secrets: {e}")
# elif "gcp_service_account" in st.secrets:
# # Support for Streamlit Cloud native secrets
# gsheet_creds = st.secrets["gcp_service_account"]
# st.success("โœ… Credentials loaded from Streamlit Secrets")
# else:
# # Fallback to file uploader for local testing
# uploaded_file = st.file_uploader("Service Account JSON", type=['json'], help="Upload for local testing. In prod, use Secrets.")
# if uploaded_file is not None:
# try:
# gsheet_creds = json.load(uploaded_file)
# st.success("Credentials loaded from file!")
# except:
# st.error("Invalid JSON file")
# st.divider()
# # --- MAIN INTERFACE ---
# st.title("โœ๏ธ English-to-Tagin Translator & Corrector")
# st.markdown("Generate multiple hypotheses, choose the best one, and save it for retraining.")
# # Load Model
# tokenizer, model, error_msg = load_model(model_path_input)
# if error_msg:
# st.error(f"โŒ Model Error: {error_msg}")
# else:
# # 1. Input Section
# st.subheader("Source Text")
# col_lang1, col_lang2 = st.columns(2)
# with col_lang1:
# source_lang = st.selectbox("Source", ["English", "Tagin"])
# with col_lang2:
# target_lang = st.selectbox("Target", ["English", "Tagin"], index=1 if source_lang=="English" else 0)
# input_text = st.text_area(
# "Input",
# height=100,
# label_visibility="collapsed",
# placeholder="Enter text to translate...",
# key="main_input"
# )
# if st.button("Translate with Beam Search ๐Ÿ”", type="primary", use_container_width=True):
# if input_text:
# with st.spinner("Generating top 5 hypotheses..."):
# results = perform_translation_beam(input_text, source_lang, target_lang, model, tokenizer)
# st.session_state.translations_list = results
# st.session_state.source_text = input_text # Lock in source text
# else:
# st.warning("Please enter some text.")
# st.divider()
# # 2. Results & Selection Section
# if st.session_state.translations_list:
# st.subheader("Select Best Translation")
# options = st.session_state.translations_list
# radio_options = [f"{i+1}. {text}" for i, text in enumerate(options)]
# selected_option_str = st.radio(
# "Top 5 Hypotheses (AI Suggestions):",
# options=radio_options,
# index=0
# )
# selected_index = radio_options.index(selected_option_str)
# final_candidate = options[selected_index]
# st.markdown("#### Review & Edit Final Output")
# st.caption("If none of the above are perfect, edit the text below before saving.")
# final_correction = st.text_area("Final Output", value=final_candidate, height=100)
# col_save, col_status = st.columns([1, 2])
# with col_save:
# if st.button("๐Ÿ’พ Save to Dataset", type="primary"):
# # Prepare Data Row
# timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# # Simplified language codes for storage
# s_code = "en_XX" if source_lang == "English" else "tgj_IN"
# t_code = "tgj_IN" if target_lang == "Tagin" else "en_XX"
# data_row = [timestamp, s_code, t_code, st.session_state.source_text, final_correction]
# # Execute Save Strategy
# success = False
# msg = ""
# if gsheet_creds and gsheet_name:
# with st.spinner("Connecting to Google Sheets..."):
# success, msg = save_to_gsheet(data_row, gsheet_creds, gsheet_name)
# else:
# msg = "Missing Credentials or Sheet Name."
# if success:
# st.success(f"Saved! ({msg})")
# time.sleep(1)
# st.rerun()
# else:
# st.error(f"Save Failed: {msg}")
# with col_status:
# st.caption(f"Saving to Google Sheet: `{gsheet_name}`")
# elif input_text:
# st.info("Hit 'Translate' to see suggestions.")
# # Create some space between main sidebar content and footer
# st.sidebar.markdown("<br>" * 5, unsafe_allow_html=True)
# st.sidebar.markdown("---")
# st.sidebar.caption("Made with โค๏ธ by Tungon Dugi")
# st.sidebar.caption("Contact: tungondugi@gmail.com")
# # Or using columns in sidebar:
# col1, col2 = st.sidebar.columns(2)
# with col1:
# st.caption("ยฉ 2026")
# with col2:
# st.caption("v0.1.1")
# --- DEPENDENCIES ---
import streamlit as st
import pandas as pd  # NOTE(review): not referenced in this file — confirm before removing
import time
from datetime import datetime
import os
import json

# Try to import the Google Sheets libraries. The app still starts without
# them: GSHEETS_AVAILABLE gates saving, and the sidebar shows an install hint.
try:
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials
    GSHEETS_AVAILABLE = True
except ImportError:
    GSHEETS_AVAILABLE = False
# --- CONFIGURATION ---
# Page chrome; Streamlit requires this before any other UI call renders.
st.set_page_config(
    page_title="Tagin Feedback Loop",
    page_icon="๐Ÿ“",
    layout="centered",
    initial_sidebar_state="expanded",
)

# --- SESSION STATE INITIALIZATION ---
# Seed every key the app reads later so first-run access never fails.
for _state_key, _state_default in (("translations_list", []), ("source_text", "")):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# --- MODEL LOADING LOGIC ---
@st.cache_resource
def load_model(model_path):
    """Load the mBART model and tokenizer from a local path or the HuggingFace Hub.

    Cached with ``st.cache_resource`` so the weights are loaded once per
    process, not once per Streamlit rerun.

    Args:
        model_path: Local directory or HuggingFace Hub repo id.

    Returns:
        ``(tokenizer, model, None)`` on success, or ``(None, None, message)``
        when loading failed for any reason (missing deps, bad path, etc.).
    """
    try:
        # Imported lazily so the app can still start and show a friendly
        # error when transformers is not installed. (The previous version
        # also imported torch here, but nothing in this function uses it;
        # transformers pulls it in itself when the model is instantiated.)
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

        tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
        model = MBartForConditionalGeneration.from_pretrained(model_path)
        return tokenizer, model, None
    except Exception as e:
        # Report the failure to the caller instead of crashing the app.
        return None, None, str(e)
def perform_translation_beam(text, source_lang, target_lang, model, tokenizer, num_beams=5):
    """Translate ``text`` and return the top ``num_beams`` beam-search hypotheses.

    Returns an empty list for empty input, or on any translation failure
    (the failure is surfaced in the UI via ``st.error``).
    """
    if not text:
        return []

    # UI language names -> tokenizer language codes.
    # NOTE(review): Tagin uses an angle-bracketed custom token while English
    # uses the stock mBART-50 code — presumably matching the fine-tuned
    # tokenizer's vocabulary; confirm against the model repo.
    codes = {"English": "en_XX", "Tagin": "<tgj_IN>"}
    src_code = codes.get(source_lang)
    tgt_code = codes.get(target_lang)

    try:
        tokenizer.src_lang = src_code
        batch = tokenizer(text, return_tensors="pt")

        # Prefer the tokenizer's lang-code table; fall back to a plain
        # vocabulary lookup for custom tokens such as "<tgj_IN>".
        lang_table = tokenizer.lang_code_to_id
        forced_bos_id = (
            lang_table[tgt_code]
            if tgt_code in lang_table
            else tokenizer.convert_tokens_to_ids(tgt_code)
        )

        outputs = model.generate(
            **batch,
            forced_bos_token_id=forced_bos_id,
            num_beams=num_beams,
            num_return_sequences=num_beams,
            max_length=128,
        )
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)
    except Exception as e:
        st.error(f"Translation Error: {str(e)}")
        return []
def save_to_gsheet(data_row, creds_dict, sheet_name):
    """Append ``data_row`` to the first worksheet of the named spreadsheet.

    Writes a header row first when the sheet is empty (cell A1 blank).
    Returns a ``(success, message)`` tuple and never raises.
    """
    if not GSHEETS_AVAILABLE:
        return False, "Libraries 'gspread' and 'oauth2client' not installed."

    oauth_scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    ]
    try:
        credentials = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, oauth_scope)
        gs_client = gspread.authorize(credentials)
        try:
            worksheet = gs_client.open(sheet_name).sheet1
        except gspread.SpreadsheetNotFound:
            return False, f"Spreadsheet '{sheet_name}' not found."

        # First-use initialization: an empty A1 means no header has been written.
        if not worksheet.cell(1, 1).value:
            worksheet.append_row(
                ['timestamp', 'source_lang', 'target_lang', 'source_text', 'corrected_translation']
            )
        worksheet.append_row(data_row)
        return True, f"Saved to Google Sheet '{sheet_name}'"
    except Exception as e:
        # Boundary handler: report auth/network failures back to the caller.
        return False, str(e)
# --- SIDEBAR ---
with st.sidebar:
    st.title("โš™๏ธ Configuration")
    st.markdown("**Active Model:**")
    # Fixed model id, shown read-only so users know which checkpoint is live.
    model_path_input = "Repleeka/mBART-tgj-final"
    st.code(model_path_input, language=None)
    st.divider()

    st.subheader("๐Ÿ’พ Storage Settings")
    gsheet_creds = None
    if not GSHEETS_AVAILABLE:
        st.error("โš ๏ธ Install dependencies: `pip install gspread oauth2client`")

    # Fixed target spreadsheet (must be shared with the service account).
    gsheet_name = "GinLish_Corpus_BOT"
    st.info(f"๐Ÿ“„ {gsheet_name}")

    # Credential resolution order: env secret -> Streamlit secrets -> upload.
    if "GSHEET_CREDENTIALS" in os.environ:
        try:
            gsheet_creds = json.loads(os.environ["GSHEET_CREDENTIALS"])
            st.success("โœ… Credentials loaded (Env)")
        except ValueError as e:
            # Malformed JSON in the secret must not crash the whole app.
            # (json.JSONDecodeError subclasses ValueError.)
            st.error(f"Invalid GSHEET_CREDENTIALS secret: {e}")
    elif "gcp_service_account" in st.secrets:
        gsheet_creds = st.secrets["gcp_service_account"]
        st.success("โœ… Credentials loaded (Secrets)")
    else:
        # Local-testing fallback: upload the service-account JSON directly.
        uploaded_file = st.file_uploader("Service Account JSON", type=['json'])
        if uploaded_file:
            try:
                gsheet_creds = json.load(uploaded_file)
            except ValueError:
                st.error("Invalid JSON file")
# --- MAIN INTERFACE ---
st.title("โœ๏ธ English-to-Tagin Translator")

# Load the (cached) model; on failure render the error and skip the UI below.
tokenizer, model, error_msg = load_model(model_path_input)
if error_msg:
    st.error(f"โŒ Model Error: {error_msg}")
else:
    # 1. Input section
    st.subheader("Source Text")
    # Use a form so pressing Enter submits as well as clicking the button.
    with st.form("translation_form", clear_on_submit=False):
        col_lang1, col_lang2 = st.columns(2)
        with col_lang1:
            source_lang = st.selectbox("Source", ["English", "Tagin"])
        with col_lang2:
            # Default the target to the opposite language of the source.
            target_lang = st.selectbox("Target", ["English", "Tagin"], index=1 if source_lang=="English" else 0)
        input_text = st.text_area(
            "Input",
            height=100,
            label_visibility="collapsed",
            placeholder="Enter text and press Enter (or click below) to translate...",
            key="main_input"
        )
        submit_button = st.form_submit_button("Translate with Beam Search ๐Ÿ”", type="primary", use_container_width=True)

    # Process translation when the button is clicked OR Enter is pressed.
    if submit_button:
        if input_text:
            # Auto-log the raw input so every query is captured even when the
            # user never saves a correction.
            if gsheet_creds and gsheet_name:
                try:
                    log_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    log_s_code = "en_XX" if source_lang == "English" else "tgj_IN"
                    log_t_code = "tgj_IN" if target_lang == "Tagin" else "en_XX"
                    log_row = [log_timestamp, log_s_code, log_t_code, input_text, "[RAW_INPUT]"]
                    save_to_gsheet(log_row, gsheet_creds, gsheet_name)
                except:
                    # Best-effort logging: never block translation on a Sheets
                    # failure. NOTE(review): bare except also hides real bugs —
                    # consider narrowing to Exception.
                    pass
            with st.spinner("Generating hypotheses..."):
                results = perform_translation_beam(input_text, source_lang, target_lang, model, tokenizer)
                st.session_state.translations_list = results
                st.session_state.source_text = input_text  # lock in the source text for saving
        else:
            st.warning("Please enter some text.")
    st.divider()

    # 2. Results & Selection Section
    if st.session_state.translations_list:
        st.subheader("Select Best Translation")
        options = st.session_state.translations_list
        # Prefix each hypothesis with its rank for the radio labels.
        radio_options = [f"{i+1}. {text}" for i, text in enumerate(options)]
        selected_option_str = st.radio(
            "Top 5 Hypotheses:",
            options=radio_options,
            index=0
        )
        selected_index = radio_options.index(selected_option_str)
        final_candidate = options[selected_index]
        st.markdown("#### Review & Edit Final Output")
        # The chosen hypothesis is editable before being saved.
        final_correction = st.text_area("Final Output", value=final_candidate, height=100)
        col_save, col_status = st.columns([1, 2])  # col_status currently unused
        with col_save:
            if st.button("๐Ÿ’พ Save to Dataset", type="primary"):
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # Simplified language codes for storage.
                s_code = "en_XX" if source_lang == "English" else "tgj_IN"
                t_code = "tgj_IN" if target_lang == "Tagin" else "en_XX"
                data_row = [timestamp, s_code, t_code, st.session_state.source_text, final_correction]
                if gsheet_creds and gsheet_name:
                    with st.spinner("Saving..."):
                        success, msg = save_to_gsheet(data_row, gsheet_creds, gsheet_name)
                    if success:
                        st.success("Saved!")
                        time.sleep(1)  # brief pause so the toast is visible before the rerun
                        st.rerun()
                    else:
                        st.error(f"Error: {msg}")
                else:
                    st.error("Missing configuration.")
    elif input_text:
        st.info("Hit 'Enter' or click 'Translate' to see suggestions.")
# Sidebar footer: vertical spacing, a divider, then the credit captions.
_sidebar = st.sidebar
_sidebar.markdown("<br>" * 5, unsafe_allow_html=True)
_sidebar.markdown("---")
for _footer_caption in ("Made with โค๏ธ by Tungon Dugi", "v0.1.2"):
    _sidebar.caption(_footer_caption)