Update app.py
Browse files
app.py
CHANGED
|
@@ -42,10 +42,27 @@ for mode, config in MODELS_CONFIG.items():
|
|
| 42 |
def get_similarity(s1, s2):
    """Return the difflib similarity ratio of two strings, ignoring case."""
    left = s1.lower()
    right = s2.lower()
    return difflib.SequenceMatcher(a=left, b=right).ratio()
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def smart_alignment(source_words, target_words, target_confidences):
|
| 46 |
n = len(source_words)
|
| 47 |
m = len(target_words)
|
| 48 |
|
|
|
|
|
|
|
| 49 |
dp = np.zeros((n + 1, m + 1))
|
| 50 |
|
| 51 |
for i in range(n + 1): dp[i][0] = i * -1.0
|
|
@@ -54,19 +71,30 @@ def smart_alignment(source_words, target_words, target_confidences):
|
|
| 54 |
for i in range(1, n + 1):
|
| 55 |
for j in range(1, m + 1):
|
| 56 |
src_word = source_words[i-1]
|
| 57 |
-
tgt_word = target_words[j-1]
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
score_delete = dp[i-1][j] - 0.5
|
| 62 |
score_insert = dp[i][j-1] - 0.5
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
dp[i][j] = best_score
|
| 72 |
|
|
@@ -75,47 +103,51 @@ def smart_alignment(source_words, target_words, target_confidences):
|
|
| 75 |
|
| 76 |
while i > 0 or j > 0:
|
| 77 |
src_word = source_words[i-1] if i > 0 else ""
|
| 78 |
-
tgt_word = target_words[j-1] if j > 0 else ""
|
| 79 |
-
|
| 80 |
current_score = dp[i][j]
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
| 83 |
if i > 0 and j > 0:
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
del_score = dp[i-1][j] - 0.5 if i > 0 else -999
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
score_1_2 = -999
|
| 91 |
-
if i > 0 and j > 1:
|
| 92 |
-
combined_tgt = target_words[j-2] + " " + target_words[j-1]
|
| 93 |
-
score_1_2 = dp[i-1][j-2] + (get_similarity(src_word, combined_tgt) + 0.1)
|
| 94 |
-
|
| 95 |
-
if i > 0 and j > 1 and abs(current_score - score_1_2) < 0.001:
|
| 96 |
-
full_tgt = target_words[j-2] + " " + target_words[j-1]
|
| 97 |
-
confs = target_confidences[j-2:j]
|
| 98 |
-
avg_conf = sum(confs)/len(confs) if confs else 0.0
|
| 99 |
-
aligned_results.append({
|
| 100 |
-
"original": src_word,
|
| 101 |
-
"corrected": full_tgt,
|
| 102 |
-
"confidence": avg_conf * 100,
|
| 103 |
-
"type": "replace"
|
| 104 |
-
})
|
| 105 |
-
i -= 1
|
| 106 |
-
j -= 2
|
| 107 |
-
elif i > 0 and j > 0 and abs(current_score - match_score) < 0.001:
|
| 108 |
-
tag = 'equal' if src_word == tgt_word else 'replace'
|
| 109 |
-
conf = target_confidences[j-1]
|
| 110 |
-
aligned_results.append({
|
| 111 |
-
"original": src_word,
|
| 112 |
-
"corrected": tgt_word,
|
| 113 |
-
"confidence": conf * 100,
|
| 114 |
-
"type": tag
|
| 115 |
-
})
|
| 116 |
-
i -= 1
|
| 117 |
-
j -= 1
|
| 118 |
-
elif i > 0 and abs(current_score - del_score) < 0.001:
|
| 119 |
aligned_results.append({
|
| 120 |
"original": src_word,
|
| 121 |
"corrected": "",
|
|
@@ -123,15 +155,17 @@ def smart_alignment(source_words, target_words, target_confidences):
|
|
| 123 |
"type": "delete"
|
| 124 |
})
|
| 125 |
i -= 1
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
|
| 136 |
aligned_results.reverse()
|
| 137 |
return aligned_results
|
|
|
|
| 42 |
def get_similarity(s1, s2):
    """Case-insensitive fuzzy similarity between two strings, in [0, 1]."""
    matcher = difflib.SequenceMatcher(None, s1.lower(), s2.lower())
    return matcher.ratio()
|
| 44 |
|
| 45 |
+
def is_start_char_match(src, tgt):
    """Return True when the words plausibly start with the "same" sound.

    Besides a literal first-character match (case-insensitive), this accepts
    a small table of phonetic equivalences (e.g. Vietnamese-style spellings:
    'f'~'ph', 'w'~'qu'/'ư', 'j'~'gi'/'d', 'z'~'d'/'r'/'v', 'k'~'c'/'q').
    Note the mapping is one-directional: only src's first letter is remapped.
    """
    if not src or not tgt:
        return False

    first_src = src[0].lower()
    tgt_lower = tgt.lower()
    first_tgt = tgt_lower[0]

    if first_src == first_tgt:
        return True

    if first_src == 'f':
        return tgt_lower.startswith('ph')
    if first_src == 'w':
        return tgt_lower.startswith('qu') or first_tgt == 'ư'
    if first_src == 'j':
        return tgt_lower.startswith('gi') or first_tgt == 'd'
    if first_src == 'z':
        return first_tgt in ('d', 'r', 'v')
    if first_src == 'k':
        return first_tgt in ('c', 'q')

    return False
|
| 59 |
+
|
| 60 |
def smart_alignment(source_words, target_words, target_confidences):
    """Align each source word to one or more consecutive target words.

    Runs a Needleman-Wunsch-style dynamic program in which a single source
    word may match up to MAX_LOOKBACK merged target words (so split tokens
    like "some thing" can align to "something"), then backtracks to emit one
    record per alignment operation.

    Args:
        source_words: list of words from the original text.
        target_words: list of words from the corrected text.
        target_confidences: per-target-word confidences in [0, 1]; assumed
            same length as target_words (not validated here).

    Returns:
        A list of dicts in text order, each with keys "original",
        "corrected", "confidence" (0-100 scale) and "type" — one of
        "equal", "replace", "delete", "insert".
    """
    n = len(source_words)
    m = len(target_words)

    MAX_LOOKBACK = 5   # max target words merged into one source word
    GAP_PENALTY = 0.5  # cost of skipping one word on either side

    def _segment_score(prev_score, src_word, combined_tgt, k):
        # Score for matching src_word against k target words joined by spaces.
        # Shared by the forward pass and the backtrack so the two formulas can
        # never disagree (the original duplicated this code in both places).
        if src_word.lower() == combined_tgt.lower():
            return prev_score + 2.0  # exact case-insensitive match dominates
        sim = get_similarity(src_word, combined_tgt)
        group_bonus = 0.15 * k if k > 1 else 0.0
        start_char_bonus = 0.5 if is_start_char_match(src_word, combined_tgt) else 0.0
        return prev_score + sim + group_bonus + start_char_bonus - 0.2

    dp = np.zeros((n + 1, m + 1))
    # BUGFIX: the border rows previously charged 1.0 per skipped word while
    # every transition charges GAP_PENALTY (0.5). The score-comparison
    # backtrack therefore never recognized a delete on column 0
    # (dp[i][0] - dp[i-1][0] = -1.0, not -0.5), fell through to the insert
    # branch with j == 0, and walked j negative — numpy then wraps
    # dp[i][-1] to dp[i][m], corrupting the result. Use one penalty everywhere.
    for i in range(n + 1):
        dp[i][0] = -GAP_PENALTY * i
    for j in range(m + 1):
        dp[0][j] = -GAP_PENALTY * j

    for i in range(1, n + 1):
        src_word = source_words[i - 1]
        for j in range(1, m + 1):
            # Gap moves: drop a source word (delete) or a target word (insert).
            best_score = max(dp[i - 1][j] - GAP_PENALTY, dp[i][j - 1] - GAP_PENALTY)
            # Match src_word against the last k target words, k = 1..MAX_LOOKBACK.
            for k in range(1, min(j, MAX_LOOKBACK) + 1):
                combined_tgt = " ".join(target_words[j - k:j])
                best_score = max(best_score,
                                 _segment_score(dp[i - 1][j - k], src_word, combined_tgt, k))
            dp[i][j] = best_score

    # Backtrack from the bottom-right corner; operations come out reversed.
    aligned_results = []
    i, j = n, m
    while i > 0 or j > 0:
        if j == 0:
            # Targets exhausted: remaining source words are forced deletions.
            # Confidence 0.0 — no recognized text backs a deletion (the
            # original's value is elided in the diff view; confirm upstream).
            aligned_results.append({
                "original": source_words[i - 1],
                "corrected": "",
                "confidence": 0.0,
                "type": "delete"
            })
            i -= 1
            continue
        if i == 0:
            # Sources exhausted: remaining target words are forced insertions.
            aligned_results.append({
                "original": "",
                "corrected": target_words[j - 1],
                "confidence": target_confidences[j - 1] * 100,
                "type": "insert"
            })
            j -= 1
            continue

        src_word = source_words[i - 1]
        current_score = dp[i][j]

        # Prefer the widest grouping whose score reproduces dp[i][j].
        found_match = False
        for k in range(min(j, MAX_LOOKBACK), 0, -1):
            combined_tgt = " ".join(target_words[j - k:j])
            match_score = _segment_score(dp[i - 1][j - k], src_word, combined_tgt, k)
            if abs(current_score - match_score) < 0.001:
                confs = target_confidences[j - k:j]
                avg_conf = sum(confs) / len(confs) if confs else 0.0
                type_tag = 'equal' if (k == 1 and src_word.lower() == combined_tgt.lower()) else 'replace'
                aligned_results.append({
                    "original": src_word,
                    "corrected": combined_tgt,
                    "confidence": avg_conf * 100,
                    "type": type_tag
                })
                i -= 1
                j -= k
                found_match = True
                break
        if found_match:
            continue

        if abs(current_score - (dp[i - 1][j] - GAP_PENALTY)) < 0.001:
            aligned_results.append({
                "original": src_word,
                "corrected": "",
                "confidence": 0.0,
                "type": "delete"
            })
            i -= 1
            continue

        # Fallback: treat the current target word as an insertion.
        aligned_results.append({
            "original": "",
            "corrected": target_words[j - 1],
            "confidence": target_confidences[j - 1] * 100,
            "type": "insert"
        })
        j -= 1

    aligned_results.reverse()
    return aligned_results
|