Upload 3 files
Browse files
- app.py +115 -95
- predict.py +70 -66
- utils.py +166 -34
app.py
CHANGED
@@ -1,95 +1,115 @@
- import gradio as gr
- from predict import predict
-
- # Map dropdown option → model path
- LANGUAGE_MODELS = {
-     "Odia": "models/odia-pos-16K.pkl",
-     "Punjabi": "models/punjabi-pos.pkl",
-     "Dogri": "models/dogri-pos.pkl"
- }
-
- def
…
- }
…
+ import gradio as gr
+ from predict import predict
+
+ # Map dropdown option → model path
+ LANGUAGE_MODELS = {
+     "Odia": "models/odia-pos-16K.pkl",
+     "Punjabi": "models/punjabi-pos.pkl",
+     "Dogri": "models/dogri-pos.pkl"
+ }
+
+ def highlight_ssf(text):
+     """Add simple HTML highlighting for SSF structure and POS tags."""
+     import re
+
+     # Highlight sentence tags <Sentence ...> and brackets
+     text = re.sub(r"(&lt;/?Sentence[^&]*&gt;)", r"<span style='color:green; font-style:italic;'>\1</span>", text)
+     text = re.sub(r"(\(\(|\)\))", r"<span style='color:green; font-style:italic;'>\1</span>", text)
+
+     # Highlight <fs ...>
+     text = re.sub(r"(&lt;fs[^&]*&gt;)", r"<span style='color:darkorange;'>\1</span>", text)
+
+     # Highlight POS tags (3rd column) → blue & bold
+     def repl_pos(match):
+         return f"{match.group(1)}<span style='color:blue; font-weight:bold;'>{match.group(2)}</span>{match.group(3)}"
+     text = re.sub(r"^(\s*\d+\t[^\t]+\t)([^\t]+)(.*)$", repl_pos, text, flags=re.MULTILINE)
+
+     return f"<pre style='font-family:monospace;'>{text}</pre>"
+
+
+ def process_file(language, file_obj, file_type):
+     model_path = LANGUAGE_MODELS.get(language)
+     if not model_path:
+         raise ValueError(f"No model available for {language}")
+
+     input_path = file_obj.name
+     output_path = f"result_{language}.txt"
+
+     result_file = predict(input_path, model_path, file_type, output_path)
+
+     with open(result_file, "r", encoding="utf-8") as f:
+         preview_raw = f.read(2000)  # first ~2000 chars for preview
+
+     # If SSF, escape angle brackets and apply highlighting
+     if file_type == "ssf":
+         preview = highlight_ssf(preview_raw.replace("<", "&lt;").replace(">", "&gt;"))
+     else:
+         preview = f"<pre>{preview_raw}</pre>"
+
+     return result_file, preview
+
+
+ def main():
+
+     with gr.Blocks(css="""
+         .download-box {
+             background: linear-gradient(90deg, #00c6ff, #0072ff);
+             padding: 20px;
+             border-radius: 12px;
+             text-align: center;
+             color: white;
+             font-weight: bold;
+             font-size: 18px;
+             box-shadow: 0px 4px 8px rgba(0,0,0,0.1);
+         }
+         .download-box .wrap.svelte-1ipelgc {
+             justify-content: center !important;
+         }
+         .block-label {
+             color: black !important;
+             font-size: 18px !important;
+             font-weight: 600 !important;
+         }
+     """) as demo:
+         gr.HTML(
+             """
+             <h1>Multilingual POS Tagger</h1>
+             <p>Upload text or CoNLL files and get POS-tagged output</p>
+             """
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 language = gr.Dropdown(
+                     ["Odia", "Punjabi", "Dogri"],
+                     label="Select Language",
+                     value="Odia"
+                 )
+                 file_in = gr.File(
+                     label="Upload Input File",
+                     file_types=[".txt", ".conll"]
+                 )
+                 file_type = gr.Radio(
+                     ["plain", "conll", "ssf"],
+                     label="File Type",
+                     value="plain"
+                 )
+                 submit = gr.Button("Run POS Tagger", variant="primary")
+
+             with gr.Column(scale=1):
+                 output_file = gr.File(label="Download Tagged File", file_types=[".txt", ".conll", ".ssf"])
+                 preview_text = gr.HTML(label="Preview (first lines)")
+
+         submit.click(process_file, inputs=[language, file_in, file_type],
+                      outputs=[output_file, preview_text])
+
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     main()
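For reference, a minimal sketch of the preview path in isolation. The SSF fragment below is invented for illustration; highlight_ssf and the escaping step are the ones defined in app.py:

    # Toy input; real files come from the Gradio upload.
    from app import highlight_ssf

    sample = "<Sentence id='1'>\n1\tmora\tPRP\t<fs af='mora'>\n</Sentence>"
    # process_file() escapes angle brackets before highlighting, which is why
    # the regexes in highlight_ssf() match the entities &lt; ... &gt;.
    escaped = sample.replace("<", "&lt;").replace(">", "&gt;")
    print(highlight_ssf(escaped))  # <pre> block with colored spans around tags and POS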
predict.py
CHANGED
@@ -1,66 +1,70 @@
- import _pickle as cPickle
- from utils import plain_to_conll, conll_to_output
- import os
-
- def word_features(sent, i):
-     word = sent[i][0]
-     if i == 0: prevword = '<START>'
-     else: prevword = sent[i - 1][0]
-     if i <= 1: prev2word = '<START>'
-     else: prev2word = sent[i - 2][0]
-     if i == len(sent) - 1: nextword = '<END>'
-     else: nextword = sent[i + 1][0]
-
-     return {
-         'word': word,
-         'prevword': prevword,
-         'nextword': nextword,
-         'suff_1': word[-1:], 'suff_2': word[-2:], 'suff_3': word[-3:], 'suff_4': word[-4:],
-         'pref_1': word[:1], 'pref_2': word[:2], 'pref_3': word[:3], 'pref_4': word[:4],
-         'prev2word': prev2word
-     }
-
- def sent2features(sent):
-     return [word_features(sent, i) for i in range(len(sent))]
-
- def load_and_predict(input_file, model, output_file):
-     with open(model, 'rb') as fid:
-         crf = cPickle.load(fid)
-
-     test_data = []
-     with open(input_file, encoding="utf8") as fr:
-         temp = []
-         for line in fr:
-             line = line.strip()
-             if line != "":
-                 chunk = (line.split("\t")[0], '')
-                 temp.append(chunk)
-             else:
-                 if temp:
-                     test_data.append(temp)
-                     temp = []
-
-     X_test1 = [sent2features(s) for s in test_data]
-     y_pred1 = crf.predict(X_test1)
-
-     with open(output_file, 'w', encoding="utf-8") as f:
-         for i in range(len(test_data)):
-             for j in range(len(test_data[i])):
-                 f.write(test_data[i][j][0] + "\t" + y_pred1[i][j] + "\n")
-             f.write("\n")
-     return output_file
-
-
- def predict(input_file, model, file_type, output_file="output.txt"):
-     temp_conll = "temp_input.conll"
-     tagged_conll = "tagged_output.conll"
-
-     if file_type == "plain":
-         plain_to_conll(input_file, temp_conll)
-         load_and_predict(temp_conll, model, tagged_conll)
-         conll_to_output(tagged_conll, output_file)
-
+ import _pickle as cPickle
+ from utils import plain_to_conll, conll_to_output, ssf_to_conll, conll_to_ssf
+ import os
+
+ def word_features(sent, i):
+     word = sent[i][0]
+     if i == 0: prevword = '<START>'
+     else: prevword = sent[i - 1][0]
+     if i <= 1: prev2word = '<START>'
+     else: prev2word = sent[i - 2][0]
+     if i == len(sent) - 1: nextword = '<END>'
+     else: nextword = sent[i + 1][0]
+
+     return {
+         'word': word,
+         'prevword': prevword,
+         'nextword': nextword,
+         'suff_1': word[-1:], 'suff_2': word[-2:], 'suff_3': word[-3:], 'suff_4': word[-4:],
+         'pref_1': word[:1], 'pref_2': word[:2], 'pref_3': word[:3], 'pref_4': word[:4],
+         'prev2word': prev2word
+     }
+
+ def sent2features(sent):
+     return [word_features(sent, i) for i in range(len(sent))]
+
+ def load_and_predict(input_file, model, output_file):
+     with open(model, 'rb') as fid:
+         crf = cPickle.load(fid)
+
+     test_data = []
+     with open(input_file, encoding="utf8") as fr:
+         temp = []
+         for line in fr:
+             line = line.strip()
+             if line != "":
+                 chunk = (line.split("\t")[0], '')
+                 temp.append(chunk)
+             else:
+                 if temp:
+                     test_data.append(temp)
+                     temp = []
+
+     X_test1 = [sent2features(s) for s in test_data]
+     y_pred1 = crf.predict(X_test1)
+
+     with open(output_file, 'w', encoding="utf-8") as f:
+         for i in range(len(test_data)):
+             for j in range(len(test_data[i])):
+                 f.write(test_data[i][j][0] + "\t" + y_pred1[i][j] + "\n")
+             f.write("\n")
+     return output_file
+
+
+ def predict(input_file, model, file_type, output_file="output.txt"):
+     temp_conll = "temp_input.conll"
+     tagged_conll = "tagged_output.conll"
+
+     if file_type == "plain":
+         plain_to_conll(input_file, temp_conll)
+         load_and_predict(temp_conll, model, tagged_conll)
+         conll_to_output(tagged_conll, output_file)
+     elif file_type == "ssf":
+         ssf_to_conll(input_file, temp_conll)
+         load_and_predict(temp_conll, model, tagged_conll)
+         conll_to_ssf(tagged_conll, input_file, output_file)
+     else:
+         load_and_predict(input_file, model, tagged_conll)
+         os.replace(tagged_conll, output_file)
+
+     return output_file
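The same pipeline can be driven without the UI. A minimal sketch, assuming a hypothetical plain-text file sample.txt (one sentence per line) and one of the model paths from app.py's LANGUAGE_MODELS:

    from predict import predict

    # "plain" route: tokenize -> temp CoNLL -> CRF tags -> "token_TAG ..." lines
    result = predict("sample.txt", "models/odia-pos-16K.pkl", "plain", "tagged.txt")
    with open(result, encoding="utf-8") as f:
        print(f.read())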
utils.py
CHANGED
@@ -1,34 +1,166 @@
- import re
…
- with open(input_file, "r", encoding="utf-8") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
-     for line in f_in:
-         line = line.strip()
-         if not line:
…
+ import re
+
+ # ---------- Plain & CoNLL (unchanged/safer) ----------
+ def plain_to_conll(input_file, temp_file):
+     with open(input_file, "r", encoding="utf-8-sig") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
+         for line in f_in:
+             line = line.strip()
+             if not line:
+                 f_out.write("\n")
+                 continue
+             for tok in line.split():
+                 f_out.write(f"{tok}\t\n")
+             f_out.write("\n")
+
+ def conll_to_output(conll_file, output_file):
+     with open(conll_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
+         sent = []
+         for line in f_in:
+             line = line.rstrip("\n")
+             if not line:
+                 if sent:
+                     f_out.write(" ".join(sent) + "\n")
+                     sent = []
+                 continue
+             parts = line.split("\t")
+             if len(parts) >= 2:
+                 sent.append(f"{parts[0]}_{parts[1]}")
+         if sent:
+             f_out.write(" ".join(sent) + "\n")
+
+
+ # ---------- SSF helpers (robust) ----------
+ _token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")
+
+ def _is_structure(line: str) -> bool:
+     s = line.strip()
+     return (
+         s == "" or
+         s.startswith("<") or   # <Sentence ...>, </Sentence>, XML-ish tags
+         s.startswith("((") or
+         s.startswith("))")
+     )
+
+ def _parse_token_line(raw: str):
+     """
+     Return (idx, token, pos, rest, used_tabs) or None if not a token line.
+     - Works with tabs or spaces.
+     - 'rest' is any trailing columns (e.g., <fs ...>).
+     - used_tabs: True if original line used tabs (preserve layout).
+     """
+     used_tabs = ("\t" in raw)
+     parts_tab = raw.split("\t") if used_tabs else None
+
+     if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit():
+         idx = parts_tab[0].strip()
+         token = parts_tab[1].strip() if len(parts_tab) >= 2 else ""
+         pos = parts_tab[2].strip() if len(parts_tab) >= 3 else ""
+         rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else ""
+         return idx, token, pos, rest, True
+
+     m = _token_line_re.match(raw)
+     if m:
+         idx, token, pos, rest = m.groups()
+         return idx, token, (pos or ""), (rest or ""), False
+
+     return None
+
+ def ssf_to_conll(input_file, temp_file):
+     """
+     Convert SSF (XML-style or classic) into CoNLL tokens.
+     - Only lines whose first column is an integer are treated as tokens.
+     - Writes a blank line at sentence boundaries.
+     """
+     with open(input_file, "r", encoding="utf-8-sig") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
+         wrote_any_in_sentence = False
+         for raw in f_in:
+             line = raw.rstrip("\n")
+
+             # Sentence boundaries: start/end tags or classic brackets trigger newline
+             if line.strip().startswith("<Sentence"):
+                 if wrote_any_in_sentence:
+                     f_out.write("\n")
+                     wrote_any_in_sentence = False
+                 continue
+             if line.strip().startswith("</Sentence>") or line.strip().startswith("))"):
+                 if wrote_any_in_sentence:
+                     f_out.write("\n")
+                     wrote_any_in_sentence = False
+                 continue
+             if line.strip().startswith("(("):
+                 if wrote_any_in_sentence:
+                     f_out.write("\n")
+                     wrote_any_in_sentence = False
+                 continue
+
+             if _is_structure(line):
+                 # blank or structural lines: ignore but do not break sentence unless handled above
+                 continue
+
+             parsed = _parse_token_line(line)
+             if parsed:
+                 _, token, _, _, _ = parsed
+                 f_out.write(f"{token}\t\n")
+                 wrote_any_in_sentence = True
+
+         # ensure trailing sentence closure gets a newline
+         if wrote_any_in_sentence:
+             f_out.write("\n")
+
+ def conll_to_ssf(conll_file, ssf_input_file, output_file):
+     """
+     Merge CRF predictions back into SSF.
+     - Replaces only the POS (3rd column), preserving index, token, and any trailing cols (e.g., <fs ...>).
+     - Preserves original tabs vs spaces layout when possible.
+     """
+     # Gather predictions (ignore blank lines)
+     preds = []
+     with open(conll_file, "r", encoding="utf-8") as f_in:
+         for line in f_in:
+             line = line.strip()
+             if not line:
+                 continue
+             parts = line.split("\t")
+             if len(parts) >= 2:
+                 preds.append((parts[0], parts[1]))  # (token, pos)
+
+     p = 0
+     with open(ssf_input_file, "r", encoding="utf-8-sig") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
+         for raw in f_in:
+             line = raw.rstrip("\n")
+
+             # Write structural lines untouched
+             if _is_structure(line):
+                 f_out.write(line + "\n")
+                 continue
+
+             parsed = _parse_token_line(line)
+             if not parsed:
+                 # Not a recognizable token line; write as-is
+                 f_out.write(line + "\n")
+                 continue
+
+             idx, token, old_pos, rest, used_tabs = parsed
+
+             # If we have a prediction, replace POS; otherwise keep old POS
+             if p < len(preds):
+                 _, new_pos = preds[p]
+                 p += 1
+             else:
+                 new_pos = old_pos if old_pos else "UNK"
+
+             if used_tabs:
+                 # preserve original tabbed structure
+                 parts = line.split("\t")
+                 # Ensure at least 3 columns
+                 while len(parts) < 3:
+                     parts.append("")
+                 parts[2] = new_pos
+                 out = "\t".join(parts)
+             else:
+                 # Normalize to tabs for clarity if original used spaces
+                 out = f"{idx}\t{token}\t{new_pos}"
+                 if rest:
+                     out += f"\t{rest}"
+
+             f_out.write(out + "\n")
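A toy round trip through the new SSF helpers; the file contents below are illustrative, and the XX/PRP/NN tags stand in for whatever the CRF predicts:

    from utils import ssf_to_conll, conll_to_ssf

    with open("in.ssf", "w", encoding="utf-8") as f:
        f.write("<Sentence id='1'>\n"
                "1\tmora\tXX\t<fs af='mora'>\n"
                "2\tghara\tXX\t<fs af='ghara'>\n"
                "</Sentence>\n")

    ssf_to_conll("in.ssf", "in.conll")  # token-per-line CoNLL: "mora", "ghara"
    # Stand-in for load_and_predict()'s tagged output:
    with open("tagged.conll", "w", encoding="utf-8") as f:
        f.write("mora\tPRP\nghara\tNN\n\n")
    conll_to_ssf("tagged.conll", "in.ssf", "out.ssf")
    # out.ssf matches in.ssf except the POS column is now PRP / NN,
    # with the <fs ...> column preserved.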