Update app.py
Browse files
app.py
CHANGED
|
@@ -42,10 +42,27 @@ for mode, config in MODELS_CONFIG.items():
|
|
| 42 |
def get_similarity(s1, s2):
    """Return the difflib similarity ratio of two strings, ignoring case."""
    left = s1.lower()
    right = s2.lower()
    return difflib.SequenceMatcher(a=left, b=right).ratio()
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def smart_alignment(source_words, target_words, target_confidences):
|
| 46 |
n = len(source_words)
|
| 47 |
m = len(target_words)
|
| 48 |
|
|
|
|
|
|
|
| 49 |
dp = np.zeros((n + 1, m + 1))
|
| 50 |
|
| 51 |
for i in range(n + 1): dp[i][0] = i * -1.0
|
|
@@ -54,19 +71,30 @@ def smart_alignment(source_words, target_words, target_confidences):
|
|
| 54 |
for i in range(1, n + 1):
|
| 55 |
for j in range(1, m + 1):
|
| 56 |
src_word = source_words[i-1]
|
| 57 |
-
tgt_word = target_words[j-1]
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
score_delete = dp[i-1][j] - 0.5
|
| 62 |
score_insert = dp[i][j-1] - 0.5
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
dp[i][j] = best_score
|
| 72 |
|
|
@@ -75,47 +103,51 @@ def smart_alignment(source_words, target_words, target_confidences):
|
|
| 75 |
|
| 76 |
while i > 0 or j > 0:
|
| 77 |
src_word = source_words[i-1] if i > 0 else ""
|
| 78 |
-
tgt_word = target_words[j-1] if j > 0 else ""
|
| 79 |
-
|
| 80 |
current_score = dp[i][j]
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
| 83 |
if i > 0 and j > 0:
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
del_score = dp[i-1][j] - 0.5 if i > 0 else -999
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
score_1_2 = -999
|
| 91 |
-
if i > 0 and j > 1:
|
| 92 |
-
combined_tgt = target_words[j-2] + " " + target_words[j-1]
|
| 93 |
-
score_1_2 = dp[i-1][j-2] + (get_similarity(src_word, combined_tgt) + 0.1)
|
| 94 |
-
|
| 95 |
-
if i > 0 and j > 1 and abs(current_score - score_1_2) < 0.001:
|
| 96 |
-
full_tgt = target_words[j-2] + " " + target_words[j-1]
|
| 97 |
-
confs = target_confidences[j-2:j]
|
| 98 |
-
avg_conf = sum(confs)/len(confs) if confs else 0.0
|
| 99 |
-
aligned_results.append({
|
| 100 |
-
"original": src_word,
|
| 101 |
-
"corrected": full_tgt,
|
| 102 |
-
"confidence": avg_conf * 100,
|
| 103 |
-
"type": "replace"
|
| 104 |
-
})
|
| 105 |
-
i -= 1
|
| 106 |
-
j -= 2
|
| 107 |
-
elif i > 0 and j > 0 and abs(current_score - match_score) < 0.001:
|
| 108 |
-
tag = 'equal' if src_word == tgt_word else 'replace'
|
| 109 |
-
conf = target_confidences[j-1]
|
| 110 |
-
aligned_results.append({
|
| 111 |
-
"original": src_word,
|
| 112 |
-
"corrected": tgt_word,
|
| 113 |
-
"confidence": conf * 100,
|
| 114 |
-
"type": tag
|
| 115 |
-
})
|
| 116 |
-
i -= 1
|
| 117 |
-
j -= 1
|
| 118 |
-
elif i > 0 and abs(current_score - del_score) < 0.001:
|
| 119 |
aligned_results.append({
|
| 120 |
"original": src_word,
|
| 121 |
"corrected": "",
|
|
@@ -123,15 +155,17 @@ def smart_alignment(source_words, target_words, target_confidences):
|
|
| 123 |
"type": "delete"
|
| 124 |
})
|
| 125 |
i -= 1
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
|
| 136 |
aligned_results.reverse()
|
| 137 |
return aligned_results
|
|
|
|
| 42 |
def get_similarity(s1, s2):
    """Case-insensitive fuzzy similarity between two strings, in [0, 1]."""
    matcher = difflib.SequenceMatcher(None, s1.lower(), s2.lower())
    return matcher.ratio()
|
| 44 |
|
| 45 |
+
def is_start_char_match(src, tgt):
    """Return True when the words plausibly start with the "same" sound.

    Besides a literal first-character match (case-insensitive), this accepts
    a small table of phonetic equivalences (e.g. Vietnamese-style spellings:
    'f'~'ph', 'w'~'qu'/'ư', 'j'~'gi'/'d', 'z'~'d'/'r'/'v', 'k'~'c'/'q').
    Note the mapping is one-directional: only src's first letter is remapped.
    """
    if not src or not tgt:
        return False

    first_src = src[0].lower()
    tgt_lower = tgt.lower()
    first_tgt = tgt_lower[0]

    if first_src == first_tgt:
        return True

    if first_src == 'f':
        return tgt_lower.startswith('ph')
    if first_src == 'w':
        return tgt_lower.startswith('qu') or first_tgt == 'ư'
    if first_src == 'j':
        return tgt_lower.startswith('gi') or first_tgt == 'd'
    if first_src == 'z':
        return first_tgt in ('d', 'r', 'v')
    if first_src == 'k':
        return first_tgt in ('c', 'q')

    return False
|
| 59 |
+
|
| 60 |
def smart_alignment(source_words, target_words, target_confidences):
    """Align each source word to one or more consecutive target words.

    Runs a Needleman-Wunsch-style dynamic program in which a single source
    word may match up to MAX_LOOKBACK merged target words (so split tokens
    like "some thing" can align to "something"), then backtracks to emit one
    record per alignment operation.

    Args:
        source_words: list of words from the original text.
        target_words: list of words from the corrected text.
        target_confidences: per-target-word confidences in [0, 1]; assumed
            same length as target_words (not validated here).

    Returns:
        A list of dicts in text order, each with keys "original",
        "corrected", "confidence" (0-100 scale) and "type" — one of
        "equal", "replace", "delete", "insert".
    """
    n = len(source_words)
    m = len(target_words)

    MAX_LOOKBACK = 5   # max target words merged into one source word
    GAP_PENALTY = 0.5  # cost of skipping one word on either side

    def _segment_score(prev_score, src_word, combined_tgt, k):
        # Score for matching src_word against k target words joined by spaces.
        # Shared by the forward pass and the backtrack so the two formulas can
        # never disagree (the original duplicated this code in both places).
        if src_word.lower() == combined_tgt.lower():
            return prev_score + 2.0  # exact case-insensitive match dominates
        sim = get_similarity(src_word, combined_tgt)
        group_bonus = 0.15 * k if k > 1 else 0.0
        start_char_bonus = 0.5 if is_start_char_match(src_word, combined_tgt) else 0.0
        return prev_score + sim + group_bonus + start_char_bonus - 0.2

    dp = np.zeros((n + 1, m + 1))
    # BUGFIX: the border rows previously charged 1.0 per skipped word while
    # every transition charges GAP_PENALTY (0.5). The score-comparison
    # backtrack therefore never recognized a delete on column 0
    # (dp[i][0] - dp[i-1][0] = -1.0, not -0.5), fell through to the insert
    # branch with j == 0, and walked j negative — numpy then wraps
    # dp[i][-1] to dp[i][m], corrupting the result. Use one penalty everywhere.
    for i in range(n + 1):
        dp[i][0] = -GAP_PENALTY * i
    for j in range(m + 1):
        dp[0][j] = -GAP_PENALTY * j

    for i in range(1, n + 1):
        src_word = source_words[i - 1]
        for j in range(1, m + 1):
            # Gap moves: drop a source word (delete) or a target word (insert).
            best_score = max(dp[i - 1][j] - GAP_PENALTY, dp[i][j - 1] - GAP_PENALTY)
            # Match src_word against the last k target words, k = 1..MAX_LOOKBACK.
            for k in range(1, min(j, MAX_LOOKBACK) + 1):
                combined_tgt = " ".join(target_words[j - k:j])
                best_score = max(best_score,
                                 _segment_score(dp[i - 1][j - k], src_word, combined_tgt, k))
            dp[i][j] = best_score

    # Backtrack from the bottom-right corner; operations come out reversed.
    aligned_results = []
    i, j = n, m
    while i > 0 or j > 0:
        if j == 0:
            # Targets exhausted: remaining source words are forced deletions.
            # Confidence 0.0 — no recognized text backs a deletion (the
            # original's value is elided in the diff view; confirm upstream).
            aligned_results.append({
                "original": source_words[i - 1],
                "corrected": "",
                "confidence": 0.0,
                "type": "delete"
            })
            i -= 1
            continue
        if i == 0:
            # Sources exhausted: remaining target words are forced insertions.
            aligned_results.append({
                "original": "",
                "corrected": target_words[j - 1],
                "confidence": target_confidences[j - 1] * 100,
                "type": "insert"
            })
            j -= 1
            continue

        src_word = source_words[i - 1]
        current_score = dp[i][j]

        # Prefer the widest grouping whose score reproduces dp[i][j].
        found_match = False
        for k in range(min(j, MAX_LOOKBACK), 0, -1):
            combined_tgt = " ".join(target_words[j - k:j])
            match_score = _segment_score(dp[i - 1][j - k], src_word, combined_tgt, k)
            if abs(current_score - match_score) < 0.001:
                confs = target_confidences[j - k:j]
                avg_conf = sum(confs) / len(confs) if confs else 0.0
                type_tag = 'equal' if (k == 1 and src_word.lower() == combined_tgt.lower()) else 'replace'
                aligned_results.append({
                    "original": src_word,
                    "corrected": combined_tgt,
                    "confidence": avg_conf * 100,
                    "type": type_tag
                })
                i -= 1
                j -= k
                found_match = True
                break
        if found_match:
            continue

        if abs(current_score - (dp[i - 1][j] - GAP_PENALTY)) < 0.001:
            aligned_results.append({
                "original": src_word,
                "corrected": "",
                "confidence": 0.0,
                "type": "delete"
            })
            i -= 1
            continue

        # Fallback: treat the current target word as an insertion.
        aligned_results.append({
            "original": "",
            "corrected": target_words[j - 1],
            "confidence": target_confidences[j - 1] * 100,
            "type": "insert"
        })
        j -= 1

    aligned_results.reverse()
    return aligned_results
|