Spaces:

guymorlan
/

TokenizerLabeller

Sleeping

App Files Files Community

guymorlan committed on Jun 19, 2023

Commit

9927ce5

1 Parent(s): c6866fc

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -19

app.py CHANGED Viewed

@@ -20,32 +20,79 @@ async () => {
 }
 """
 pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
 r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
 data = json.loads(r.text)
 def predict(input):
-    out = pipe(input)[0]['translation_text']
-    raw = out
-    out = [x.strip() for x in out.split(" + ")]
     output = f"""
-        <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>{raw}<br><br>"""
-    for o in out:
-        oo = [x.strip() for x in o.split("+")]
-        output += "<span style='background-color: #E0E0E0; border-radius: 5px; padding: 5px; margin-right: 5px; display: inline-block;'>"
-        for ooo in oo:
-            if ooo in data:
                 output += f"""
-                    <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; padding: 2px; margin-right: 2px; font-family: "Courier New", Courier, monospace;'
-                    onmouseover='showCard(event, "{data[ooo]['translation']}", "{data[ooo]['features']}")'
-                    onmouseout='hideCard(event)' onclick='showCard(event, "{data[ooo]['translation']}", "{data[ooo]['features']}")'>{data[ooo]['word']}</span>
-                """
             else:
-                output += ooo
-        output += "</span> "
     output += "</div>"
     output += """
@@ -57,12 +104,12 @@ def predict(input):
     """
     return output
-with gr.Blocks(title="Ammiya Tokenizer and Annotator") as demo:
     gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
     with gr.Row():
         with gr.Column():
             input = gr.Textbox(label="Input", placeholder="Enter English Text", lines=1)
-            gr.Examples(["بديش اروح معك", "مكنتش هون قبل ما جيت"], input)
             btn = gr.Button(label="Analyze")
         with gr.Column():
             with gr.Box():
@@ -71,5 +118,4 @@ with gr.Blocks(title="Ammiya Tokenizer and Annotator") as demo:
     input.submit(predict, inputs = [input], outputs=[html])
     demo.load(_js=js)
-    demo.launch()

 }
 """
+def get_matches(text):
+    """Align the model's "surface:lexicon" pairs with character spans of `text`.
+
+    Runs `pipe` on `text`, splits the predicted string on " = " and "+" into
+    pieces, and keeps the pieces of the form "surface:lexicon". The pairs are
+    then located left to right in `text`, each search resuming where the
+    previous match ended.
+
+    Returns:
+        A tuple `(text, pred, matches)`: the input, the raw prediction, and a
+        list of dicts with keys "start", "end", "match" and "lexicon".
+        Scanning stops at the end of `text`, so a pair that cannot be located
+        (and every pair after it) is dropped. `matches` is empty when the
+        prediction contains no usable pair.
+    """
+    pred = pipe(text, max_length=5000)[0]["translation_text"]
+    pieces = [piece for group in pred.split(" = ") for piece in group.split("+")]
+    # Keep only well-formed "surface:lexicon" pairs. An empty surface form is
+    # dropped as well: it would yield a zero-length span (start == end), which
+    # a consumer that advances by jumping to "end" can never get past.
+    mapping = [
+        pair
+        for pair in (piece.split(":") for piece in pieces)
+        if len(pair) == 2 and pair[0]
+    ]
+    matches = []
+    i = 0
+    k = 0  # index of the next pair to locate in `text`
+    # An empty `mapping` simply skips the loop; an unconditional pop(0) here
+    # would raise IndexError whenever the prediction has no usable pairs.
+    while i < len(text) and k < len(mapping):
+        surface, lexicon = mapping[k]
+        # startswith with an offset avoids copying the tail of `text` each step.
+        if text.startswith(surface, i):
+            matches.append({"start": i, "end": i + len(surface), "match": surface, "lexicon": lexicon})
+            i += len(surface)
+            k += 1
+        else:
+            i += 1
+    return (text, pred, matches)
 pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
 r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
 data = json.loads(r.text)
 def predict(input):
+    text, pred, matches = get_matches(input)
+    matches = {x["start"]: x for x in matches}
     output = f"""
+        <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
+    i = 0
+    while i < len(text):
+        if i in matches:
+            match = matches[i]["lexicon"]
+            # if match ends with _R, remove _R suffix
+            if match.endswith("_R"):
+                match = match[:-2]
+            if match in data:
+                # match = matches[i]["lexicon"]
                 output += f"""
+                        <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
+                        onmouseover='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'
+                        onmouseout='hideCard(event)' onclick='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'>{matches[i]['match']}</span>
+                        """
             else:
+                output += matches[i]["match"]
+            i = matches[i]["end"]
+        else:
+            print(f"'{text[i]}'")
+            if text[i] == " ":
+                output += "&nbsp"
+            else:
+                output += text[i]
+            i += 1
     output += "</div>"
     output += """
     """
     return output
+with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
     gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
     with gr.Row():
         with gr.Column():
             input = gr.Textbox(label="Input", placeholder="Enter English Text", lines=1)
+            gr.Examples(["بديش اروح معك"], input)
             btn = gr.Button(label="Analyze")
         with gr.Column():
             with gr.Box():
     input.submit(predict, inputs = [input], outputs=[html])
     demo.load(_js=js)
+    demo.launch()