Spaces:

songhieng
/

khmer-mt5-summarization-interface

Sleeping

App Files Community

songhieng committed on Apr 30, 2025

Commit

26f89cf

verified ·

1 Parent(s): 9599706

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -25

app.py CHANGED Viewed

@@ -1,10 +1,19 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-# 1. Model identifier
 MODEL_ID = "songhieng/khmer-mt5-summarization"
-# 2. Load tokenizer (you can choose fast or slow; fast is the default)
 @st.cache_resource
 def load_tokenizer_and_model(model_id):
     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
@@ -13,33 +22,20 @@ def load_tokenizer_and_model(model_id):
 tokenizer, model = load_tokenizer_and_model(MODEL_ID)
-# 3. Streamlit page config
-st.set_page_config(
-    page_title="Khmer Text Summarization",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
 # 4. App header
 st.title("📝 Khmer Text Summarization")
 st.write("Paste your Khmer text below and click **Summarize** to get a concise summary.")
 # 5. Sidebar summarization settings
 st.sidebar.header("Summarization Settings")
-max_length = st.sidebar.slider(
-    "Maximum summary length", 50, 300, 150, step=10
-)
-min_length = st.sidebar.slider(
-    "Minimum summary length", 10, 100, 30, step=5
-)
-num_beams = st.sidebar.slider(
-    "Beam search width", 1, 10, 4, step=1
-)
 # 6. Text input
 user_input = st.text_area(
-    "Enter Khmer text here…",
-    height=300,
     placeholder="សូមវាយអត្ថបទខ្មែរនៅទីនេះ…"
 )
@@ -49,14 +45,14 @@ if st.button("Summarize"):
         st.warning("⚠️ Please enter some text to summarize.")
     else:
         with st.spinner("Generating summary…"):
-            # Tokenize
             inputs = tokenizer(
                 user_input,
                 return_tensors="pt",
                 truncation=True,
                 padding="longest"
             )
-            # Generate
             summary_ids = model.generate(
                 **inputs,
                 max_length=max_length,
@@ -65,11 +61,10 @@ if st.button("Summarize"):
                 length_penalty=2.0,
                 early_stopping=True
             )
-            # Decode
             summary = tokenizer.decode(
-                summary_ids[0],
                 skip_special_tokens=True
             )
-        # Display
         st.subheader("🔖 Summary:")
         st.write(summary)

 import streamlit as st
+# 1. Streamlit page config MUST be the first Streamlit command
+st.set_page_config(
+    page_title="Khmer Text Summarization",
+    page_icon="📝",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# 2. Model identifier
 MODEL_ID = "songhieng/khmer-mt5-summarization"
+# 3. Load tokenizer & model, cached to avoid reloading every run
 @st.cache_resource
 def load_tokenizer_and_model(model_id):
     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
 tokenizer, model = load_tokenizer_and_model(MODEL_ID)
 # 4. App header
 st.title("📝 Khmer Text Summarization")
 st.write("Paste your Khmer text below and click **Summarize** to get a concise summary.")
 # 5. Sidebar summarization settings
 st.sidebar.header("Summarization Settings")
+max_length = st.sidebar.slider("Maximum summary length", 50, 300, 150, step=10)
+min_length = st.sidebar.slider("Minimum summary length", 10, 100, 30, step=5)
+num_beams = st.sidebar.slider("Beam search width", 1, 10, 4, step=1)
 # 6. Text input
 user_input = st.text_area(
+    "Enter Khmer text here…",
+    height=300,
     placeholder="សូមវាយអត្ថបទខ្មែរនៅទីនេះ…"
 )
         st.warning("⚠️ Please enter some text to summarize.")
     else:
         with st.spinner("Generating summary…"):
+            # Tokenize the input text
             inputs = tokenizer(
                 user_input,
                 return_tensors="pt",
                 truncation=True,
                 padding="longest"
             )
+            # Generate the summary
             summary_ids = model.generate(
                 **inputs,
                 max_length=max_length,
                 length_penalty=2.0,
                 early_stopping=True
             )
+            # Decode and display
             summary = tokenizer.decode(
+                summary_ids[0],
                 skip_special_tokens=True
             )
         st.subheader("🔖 Summary:")
         st.write(summary)