Spaces:

MariaOls
/

DiMa_DeMo

Sleeping

App Files Files Community

MariaOls committed on Oct 13, 2025

Commit

115ec55

1 Parent(s): e954f40

russian input

Browse files

Files changed (1) hide show

app.py +23 -21

app.py CHANGED Viewed

@@ -19,6 +19,12 @@ from huggingface_hub import hf_hub_download
 from transformers import (
     AutoTokenizer, AutoModelForSequenceClassification, pipeline
 )
 MODEL_ID = "MariaOls/DiMa_new"
 THRESHOLD = 0.5  # probability threshold for 'dm'
@@ -109,24 +115,26 @@ def classify_marked_batch(marked_texts: List[str]) -> List[float]:
     return [float(p) for p in probs]
 # -------------------- Core pipeline --------------------
-def run_pipeline(english_text: str) -> tuple[str, str, str]:
     """
-    Returns:
-      - pretty_result: '✨ candidate1, candidate2' or 'no DMs found'
-      - ru_text: translated text (for display)
-      - debug_info: optional info string (can hide/show)
     """
-    if not english_text or not english_text.strip():
         return "no input", "", ""
-    # 1) translate
-    ru_text = translator(english_text.strip())[0]["translation_text"].strip()
     # 2) sentence split
     sents = split_sentences(ru_text)
     # 3) detect & classify
-    marked, mapping = [], []  # mapping: (sent_idx, span_text)
     for si, sent in enumerate(sents):
         spans = detect_candidates_ci(sent, GAZ)
         for (st, en, span) in spans:
@@ -139,17 +147,11 @@ def run_pipeline(english_text: str) -> tuple[str, str, str]:
         if p >= THRESHOLD:
             dm_candidates.append(span)
-    # unique, preserve order
     seen = set()
     dm_candidates = [x for x in dm_candidates if not (x in seen or seen.add(x))]
-    if dm_candidates:
-        # “extra cuqui” ✨
-        pretty = "✨ " + " · ".join(dm_candidates)
-    else:
-        pretty = "no DMs found"
-    # Optional debug: show RU + count
     info = f"RU: {ru_text}\nDMs: {len(dm_candidates)}"
     return pretty, ru_text, info
@@ -159,10 +161,10 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
 .small note {opacity:.8}
 """) as demo:
     gr.Markdown("<h1 id='title'>DiMa_new — Discourse Marker Demo 🇷🇺✨</h1>")
-    gr.Markdown("Type an English sentence, we’ll translate to Russian, scan for candidates, and only show those judged as DM.")
     with gr.Row():
-        inp = gr.Textbox(label="English input", placeholder="e.g., In fact, we should probably leave now.", lines=3)
     with gr.Row():
         btn = gr.Button("Check ✨", variant="primary")
     with gr.Row():
@@ -174,9 +176,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
     examples = gr.Examples(
         examples=[
-            ["In fact, we should probably leave now."],
             ["Well, I think it's better to wait."],
-            ["To be honest, this isn't ideal."],
             ["He said that, apparently, they cancelled it."]
         ],
         inputs=[inp]

 from transformers import (
     AutoTokenizer, AutoModelForSequenceClassification, pipeline
 )
+import re
+CYRILLIC_RE = re.compile(r"[А-Яа-яЁё]")
+def is_russian(text: str) -> bool:
+    return bool(CYRILLIC_RE.search(text or ""))
 MODEL_ID = "MariaOls/DiMa_new"
 THRESHOLD = 0.5  # probability threshold for 'dm'
     return [float(p) for p in probs]
 # -------------------- Core pipeline --------------------
+def run_pipeline(user_text: str) -> tuple[str, str, str]:
     """
+    Acepta inglés o ruso.
+    - Si detecta cirílico, toma el texto tal cual (ruso).
+    - En otro caso, traduce de EN->RU.
     """
+    if not user_text or not user_text.strip():
         return "no input", "", ""
+    # 1) ruso directo o traducción desde inglés
+    if is_russian(user_text):
+        ru_text = user_text.strip()
+    else:
+        ru_text = translator(user_text.strip())[0]["translation_text"].strip()
     # 2) sentence split
     sents = split_sentences(ru_text)
     # 3) detect & classify
+    marked, mapping = [], []
     for si, sent in enumerate(sents):
         spans = detect_candidates_ci(sent, GAZ)
         for (st, en, span) in spans:
         if p >= THRESHOLD:
             dm_candidates.append(span)
+    # único, preservando orden
     seen = set()
     dm_candidates = [x for x in dm_candidates if not (x in seen or seen.add(x))]
+    pretty = "✨ " + " · ".join(dm_candidates) if dm_candidates else "no DMs found"
     info = f"RU: {ru_text}\nDMs: {len(dm_candidates)}"
     return pretty, ru_text, info
 .small note {opacity:.8}
 """) as demo:
     gr.Markdown("<h1 id='title'>DiMa_new — Discourse Marker Demo 🇷🇺✨</h1>")
+    gr.Markdown("Type an English **or Russian** sentence. If English, we translate to Russian, scan for candidates, and only show those judged as DM.")
     with gr.Row():
+        inp = gr.Textbox(label="English or Russian input", placeholder="e.g., In fact, we should probably leave now.", lines=3)
     with gr.Row():
         btn = gr.Button("Check ✨", variant="primary")
     with gr.Row():
     examples = gr.Examples(
         examples=[
+            ["по-моему, он не прав."],
             ["Well, I think it's better to wait."],
+            ["Кажется, он уже ушел."],
             ["He said that, apparently, they cancelled it."]
         ],
         inputs=[inp]