MariaOls committed on
Commit
115ec55
·
1 Parent(s): e954f40

Russian input

Browse files
Files changed (1) hide show
  1. app.py +23 -21
app.py CHANGED
@@ -19,6 +19,12 @@ from huggingface_hub import hf_hub_download
19
  from transformers import (
20
  AutoTokenizer, AutoModelForSequenceClassification, pipeline
21
  )
 
 
 
 
 
 
22
 
23
  MODEL_ID = "MariaOls/DiMa_new"
24
  THRESHOLD = 0.5 # probability threshold for 'dm'
@@ -109,24 +115,26 @@ def classify_marked_batch(marked_texts: List[str]) -> List[float]:
109
  return [float(p) for p in probs]
110
 
111
  # -------------------- Core pipeline --------------------
112
- def run_pipeline(english_text: str) -> tuple[str, str, str]:
113
  """
114
- Returns:
115
- - pretty_result: '✨ candidate1, candidate2' or 'no DMs found'
116
- - ru_text: translated text (for display)
117
- - debug_info: optional info string (can hide/show)
118
  """
119
- if not english_text or not english_text.strip():
120
  return "no input", "", ""
121
 
122
- # 1) translate
123
- ru_text = translator(english_text.strip())[0]["translation_text"].strip()
 
 
 
124
 
125
  # 2) sentence split
126
  sents = split_sentences(ru_text)
127
 
128
  # 3) detect & classify
129
- marked, mapping = [], [] # mapping: (sent_idx, span_text)
130
  for si, sent in enumerate(sents):
131
  spans = detect_candidates_ci(sent, GAZ)
132
  for (st, en, span) in spans:
@@ -139,17 +147,11 @@ def run_pipeline(english_text: str) -> tuple[str, str, str]:
139
  if p >= THRESHOLD:
140
  dm_candidates.append(span)
141
 
142
- # unique, preserve order
143
  seen = set()
144
  dm_candidates = [x for x in dm_candidates if not (x in seen or seen.add(x))]
145
 
146
- if dm_candidates:
147
- # “extra cuqui” ✨
148
- pretty = "✨ " + " · ".join(dm_candidates)
149
- else:
150
- pretty = "no DMs found"
151
-
152
- # Optional debug: show RU + count
153
  info = f"RU: {ru_text}\nDMs: {len(dm_candidates)}"
154
  return pretty, ru_text, info
155
 
@@ -159,10 +161,10 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
159
  .small note {opacity:.8}
160
  """) as demo:
161
  gr.Markdown("<h1 id='title'>DiMa_new — Discourse Marker Demo 🇷🇺✨</h1>")
162
- gr.Markdown("Type an English sentence, we’ll translate to Russian, scan for candidates, and only show those judged as DM.")
163
 
164
  with gr.Row():
165
- inp = gr.Textbox(label="English input", placeholder="e.g., In fact, we should probably leave now.", lines=3)
166
  with gr.Row():
167
  btn = gr.Button("Check ✨", variant="primary")
168
  with gr.Row():
@@ -174,9 +176,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css="""
174
 
175
  examples = gr.Examples(
176
  examples=[
177
- ["In fact, we should probably leave now."],
178
  ["Well, I think it's better to wait."],
179
- ["To be honest, this isn't ideal."],
180
  ["He said that, apparently, they cancelled it."]
181
  ],
182
  inputs=[inp]
 
19
  from transformers import (
20
  AutoTokenizer, AutoModelForSequenceClassification, pipeline
21
  )
22
+ import re
23
+
24
+ CYRILLIC_RE = re.compile(r"[А-Яа-яЁё]")
25
+
26
+ def is_russian(text: str) -> bool:
27
+ return bool(CYRILLIC_RE.search(text or ""))
28
 
29
  MODEL_ID = "MariaOls/DiMa_new"
30
  THRESHOLD = 0.5 # probability threshold for 'dm'
 
115
  return [float(p) for p in probs]
116
 
117
  # -------------------- Core pipeline --------------------
118
+ def run_pipeline(user_text: str) -> tuple[str, str, str]:
119
  """
120
+ Acepta inglés o ruso.
121
+ - Si detecta cirílico, toma el texto tal cual (ruso).
122
+ - En otro caso, traduce de EN->RU.
 
123
  """
124
+ if not user_text or not user_text.strip():
125
  return "no input", "", ""
126
 
127
+ # 1) ruso directo o traducción desde inglés
128
+ if is_russian(user_text):
129
+ ru_text = user_text.strip()
130
+ else:
131
+ ru_text = translator(user_text.strip())[0]["translation_text"].strip()
132
 
133
  # 2) sentence split
134
  sents = split_sentences(ru_text)
135
 
136
  # 3) detect & classify
137
+ marked, mapping = [], []
138
  for si, sent in enumerate(sents):
139
  spans = detect_candidates_ci(sent, GAZ)
140
  for (st, en, span) in spans:
 
147
  if p >= THRESHOLD:
148
  dm_candidates.append(span)
149
 
150
+ # único, preservando orden
151
  seen = set()
152
  dm_candidates = [x for x in dm_candidates if not (x in seen or seen.add(x))]
153
 
154
+ pretty = "✨ " + " · ".join(dm_candidates) if dm_candidates else "no DMs found"
 
 
 
 
 
 
155
  info = f"RU: {ru_text}\nDMs: {len(dm_candidates)}"
156
  return pretty, ru_text, info
157
 
 
161
  .small note {opacity:.8}
162
  """) as demo:
163
  gr.Markdown("<h1 id='title'>DiMa_new — Discourse Marker Demo 🇷🇺✨</h1>")
164
+ gr.Markdown("Type an English **or Russian** sentence. If English, we translate to Russian, scan for candidates, and only show those judged as DM.")
165
 
166
  with gr.Row():
167
+ inp = gr.Textbox(label="English or Russian input", placeholder="e.g., In fact, we should probably leave now.", lines=3)
168
  with gr.Row():
169
  btn = gr.Button("Check ✨", variant="primary")
170
  with gr.Row():
 
176
 
177
  examples = gr.Examples(
178
  examples=[
179
+ ["по-моему, он не прав."],
180
  ["Well, I think it's better to wait."],
181
+ ["Кажется, он уже ушел."],
182
  ["He said that, apparently, they cancelled it."]
183
  ],
184
  inputs=[inp]