yammdd committed on
Commit
894961b
·
verified ·
1 Parent(s): 2bfbdbe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -54
app.py CHANGED
@@ -42,10 +42,27 @@ for mode, config in MODELS_CONFIG.items():
42
  def get_similarity(s1, s2):
43
  return difflib.SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def smart_alignment(source_words, target_words, target_confidences):
46
  n = len(source_words)
47
  m = len(target_words)
48
 
 
 
49
  dp = np.zeros((n + 1, m + 1))
50
 
51
  for i in range(n + 1): dp[i][0] = i * -1.0
@@ -54,19 +71,30 @@ def smart_alignment(source_words, target_words, target_confidences):
54
  for i in range(1, n + 1):
55
  for j in range(1, m + 1):
56
  src_word = source_words[i-1]
57
- tgt_word = target_words[j-1]
58
 
59
- score_1_1 = dp[i-1][j-1] + (1.0 if src_word == tgt_word else (get_similarity(src_word, tgt_word) - 0.2))
60
 
61
- score_delete = dp[i-1][j] - 0.5
62
  score_insert = dp[i][j-1] - 0.5
 
63
 
64
- best_score = max(score_1_1, score_delete, score_insert)
65
-
66
- if j > 1:
67
- combined_tgt = target_words[j-2] + " " + target_words[j-1]
68
- score_1_2 = dp[i-1][j-2] + (get_similarity(src_word, combined_tgt) + 0.1)
69
- best_score = max(best_score, score_1_2)
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  dp[i][j] = best_score
72
 
@@ -75,47 +103,51 @@ def smart_alignment(source_words, target_words, target_confidences):
75
 
76
  while i > 0 or j > 0:
77
  src_word = source_words[i-1] if i > 0 else ""
78
- tgt_word = target_words[j-1] if j > 0 else ""
79
-
80
  current_score = dp[i][j]
81
 
82
- match_score = -999
 
 
83
  if i > 0 and j > 0:
84
- base_sim = 1.0 if src_word == tgt_word else (get_similarity(src_word, tgt_word) - 0.2)
85
- match_score = dp[i-1][j-1] + base_sim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
 
 
 
87
  del_score = dp[i-1][j] - 0.5 if i > 0 else -999
88
- ins_score = dp[i][j-1] - 0.5 if j > 0 else -999
89
-
90
- score_1_2 = -999
91
- if i > 0 and j > 1:
92
- combined_tgt = target_words[j-2] + " " + target_words[j-1]
93
- score_1_2 = dp[i-1][j-2] + (get_similarity(src_word, combined_tgt) + 0.1)
94
-
95
- if i > 0 and j > 1 and abs(current_score - score_1_2) < 0.001:
96
- full_tgt = target_words[j-2] + " " + target_words[j-1]
97
- confs = target_confidences[j-2:j]
98
- avg_conf = sum(confs)/len(confs) if confs else 0.0
99
- aligned_results.append({
100
- "original": src_word,
101
- "corrected": full_tgt,
102
- "confidence": avg_conf * 100,
103
- "type": "replace"
104
- })
105
- i -= 1
106
- j -= 2
107
- elif i > 0 and j > 0 and abs(current_score - match_score) < 0.001:
108
- tag = 'equal' if src_word == tgt_word else 'replace'
109
- conf = target_confidences[j-1]
110
- aligned_results.append({
111
- "original": src_word,
112
- "corrected": tgt_word,
113
- "confidence": conf * 100,
114
- "type": tag
115
- })
116
- i -= 1
117
- j -= 1
118
- elif i > 0 and abs(current_score - del_score) < 0.001:
119
  aligned_results.append({
120
  "original": src_word,
121
  "corrected": "",
@@ -123,15 +155,17 @@ def smart_alignment(source_words, target_words, target_confidences):
123
  "type": "delete"
124
  })
125
  i -= 1
126
- else:
127
- conf = target_confidences[j-1]
128
- aligned_results.append({
129
- "original": "",
130
- "corrected": tgt_word,
131
- "confidence": conf * 100,
132
- "type": "insert"
133
- })
134
- j -= 1
 
 
135
 
136
  aligned_results.reverse()
137
  return aligned_results
 
def get_similarity(s1: str, s2: str) -> float:
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lowercased and compared with
    ``difflib.SequenceMatcher`` (no junk heuristic), so the result is a
    float in ``[0.0, 1.0]`` where ``1.0`` means the lowercased strings are
    identical. Two empty strings also yield ``1.0``.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        The ``SequenceMatcher.ratio()`` of the two lowercased strings.
    """
    return difflib.SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
44
 
45
+ def is_start_char_match(src, tgt):
46
+ if not src or not tgt: return False
47
+ c1 = src[0].lower()
48
+ c2 = tgt[0].lower()
49
+
50
+ if c1 == c2: return True
51
+
52
+ if c1 == 'f' and tgt.lower().startswith('ph'): return True
53
+ if c1 == 'w' and (tgt.lower().startswith('qu') or c2 == 'ư'): return True
54
+ if c1 == 'j' and (tgt.lower().startswith('gi') or c2 == 'd'): return True
55
+ if c1 == 'z' and c2 in ['d', 'r', 'v']: return True
56
+ if c1 == 'k' and c2 in ['c', 'q']: return True
57
+
58
+ return False
59
+
60
  def smart_alignment(source_words, target_words, target_confidences):
61
  n = len(source_words)
62
  m = len(target_words)
63
 
64
+ MAX_LOOKBACK = 5
65
+
66
  dp = np.zeros((n + 1, m + 1))
67
 
68
  for i in range(n + 1): dp[i][0] = i * -1.0
 
71
  for i in range(1, n + 1):
72
  for j in range(1, m + 1):
73
  src_word = source_words[i-1]
 
74
 
75
+ best_score = dp[i-1][j] - 0.5
76
 
 
77
  score_insert = dp[i][j-1] - 0.5
78
+ best_score = max(best_score, score_insert)
79
 
80
+ for k in range(1, min(j, MAX_LOOKBACK) + 1):
81
+ segment_words = target_words[j-k : j]
82
+ combined_tgt = " ".join(segment_words)
83
+
84
+ sim = get_similarity(src_word, combined_tgt)
85
+
86
+ group_bonus = 0.15 * k if k > 1 else 0
87
+
88
+ start_char_bonus = 0.0
89
+ if is_start_char_match(src_word, combined_tgt):
90
+ start_char_bonus = 0.5
91
+
92
+ match_score = dp[i-1][j-k] + sim + group_bonus + start_char_bonus - 0.2
93
+
94
+ if src_word.lower() == combined_tgt.lower():
95
+ match_score = dp[i-1][j-k] + 2.0
96
+
97
+ best_score = max(best_score, match_score)
98
 
99
  dp[i][j] = best_score
100
 
 
103
 
104
  while i > 0 or j > 0:
105
  src_word = source_words[i-1] if i > 0 else ""
 
 
106
  current_score = dp[i][j]
107
 
108
+ found_match = False
109
+
110
+ max_k_check = min(j, MAX_LOOKBACK)
111
  if i > 0 and j > 0:
112
+ for k in range(max_k_check, 0, -1):
113
+ prev_score = dp[i-1][j-k]
114
+ segment_words = target_words[j-k : j]
115
+ combined_tgt = " ".join(segment_words)
116
+
117
+ sim = get_similarity(src_word, combined_tgt)
118
+ group_bonus = 0.15 * k if k > 1 else 0
119
+
120
+ start_char_bonus = 0.0
121
+ if is_start_char_match(src_word, combined_tgt):
122
+ start_char_bonus = 0.5
123
+
124
+ match_score = prev_score + sim + group_bonus + start_char_bonus - 0.2
125
+
126
+ if src_word.lower() == combined_tgt.lower():
127
+ match_score = prev_score + 2.0
128
+
129
+ if abs(current_score - match_score) < 0.001:
130
+ confs = target_confidences[j-k : j]
131
+ avg_conf = sum(confs)/len(confs) if confs else 0.0
132
+
133
+ type_tag = 'equal' if (k == 1 and src_word.lower() == combined_tgt.lower()) else 'replace'
134
+
135
+ aligned_results.append({
136
+ "original": src_word,
137
+ "corrected": combined_tgt,
138
+ "confidence": avg_conf * 100,
139
+ "type": type_tag
140
+ })
141
+ i -= 1
142
+ j -= k
143
+ found_match = True
144
+ break
145
 
146
+ if found_match:
147
+ continue
148
+
149
  del_score = dp[i-1][j] - 0.5 if i > 0 else -999
150
+ if i > 0 and abs(current_score - del_score) < 0.001:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  aligned_results.append({
152
  "original": src_word,
153
  "corrected": "",
 
155
  "type": "delete"
156
  })
157
  i -= 1
158
+ continue
159
+
160
+ tgt_word = target_words[j-1] if j > 0 else ""
161
+ conf = target_confidences[j-1] if j > 0 else 0.0
162
+ aligned_results.append({
163
+ "original": "",
164
+ "corrected": tgt_word,
165
+ "confidence": conf * 100,
166
+ "type": "insert"
167
+ })
168
+ j -= 1
169
 
170
  aligned_results.reverse()
171
  return aligned_results