Spaces:

chuuhtetnaing
/

myanmar-text-segmentation-app

Running

App Files Files Community

chuuhtetnaing committed 13 days ago

Commit

975e2cd

1 Parent(s): 1d66d1d

setup project

Browse files

Files changed (8) hide show

.gitignore +23 -0
.pre-commit-config.yaml +14 -0
Makefile +15 -0
app.py +26 -133
pyproject.toml +16 -0
requirements.txt +0 -6
segmentation.py +123 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,23 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+# ENV
+.env.development
+.env.production
+# IDE
+.idea
+# OS
+.DS_Store
+# Notebook
+notebook/*.png

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+repos:
+-  repo: https://github.com/astral-sh/ruff-pre-commit
+   # Ruff version.
+   rev: v0.14.10
+   hooks:
+        # Run the linter.
+    -   id: ruff
+        types_or: [ python, pyi ]
+        args: [ --config, ruff.toml, --fix ]
+        # Run the formatter.
+    -   id: ruff-format
+        types_or: [ python, pyi ]
+        args: [ --config, ruff.toml ]

Makefile ADDED Viewed

	@@ -0,0 +1,15 @@

+.DEFAULT_GOAL := start
+setup-precommit-hook:
+	uv run pre-commit install
+	uv run pre-commit autoupdate
+install: setup-precommit-hook
+	uv sync --group dev
+lint:
+	uv run pre-commit run --all-files
+start:
+	uv run gradio app.py

app.py CHANGED Viewed

@@ -1,154 +1,47 @@
 import gradio as gr
-import numpy as np
-import random
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
-import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
 css = """
 #col-container {
     margin: 0 auto;
-    max-width: 640px;
 }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )
-            run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
             )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
             )
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
-    )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
def segment(text: str) -> str:
    """Return the segmented form of ``text``.

    Placeholder implementation: no segmentation is performed yet, so the
    input string is handed back unchanged.
    """
    segmented = text
    return segmented
 css = """
 #col-container {
     margin: 0 auto;
+    max-width: 900px;
+    padding: 0 1rem;
+}
+#input-output-row {
+    flex-direction: row;
+    gap: 1rem;
+}
+@media (max-width: 768px) {
+    #input-output-row {
+        flex-direction: column;
+    }
 }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
+        gr.Markdown("# Myanmar Text Segmentation")
+        with gr.Row(elem_id="input-output-row", equal_height=True):
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter Myanmar text here...",
+                lines=6,
             )
+            output_text = gr.Textbox(
+                label="Segmented Text",
+                lines=6,
             )
+        run_button = gr.Button("Segment", variant="primary")
+    run_button.click(fn=segment, inputs=input_text, outputs=output_text)
+    input_text.submit(fn=segment, inputs=input_text, outputs=output_text)
 if __name__ == "__main__":
     demo.launch()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,16 @@

+[project]
+name = "myanmar-text-segmentation-app"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.12"
+dependencies = [
+    "gradio>=6.2.0",
+    "torch>=2.9.1",
+    "torchvision>=0.24.1",
+    "transformers>=4.57.3",
+]
+[dependency-groups]
+dev = [
+    "pre-commit>=4.5.1",
+]

requirements.txt DELETED Viewed

@@ -1,6 +0,0 @@
-accelerate
-diffusers
-invisible_watermark
-torch
-transformers
-xformers

segmentation.py ADDED Viewed

	@@ -0,0 +1,123 @@

+from transformers import pipeline
+classifier = pipeline("ner", model="chuuhtetnaing/myanmar_text_segmentation_model")
def reconstruct(tokens, labels):
    """
    Join tokens into one string, using their labels to place word breaks.

    Tokens and labels are paired positionally (extra items on the longer
    side are ignored). A single space is inserted in front of every token
    labelled 'B', except when that token is the very first one; tokens
    with any other label are glued directly onto what came before.
    """
    pieces = []
    for index, (piece, tag) in enumerate(zip(tokens, labels)):
        # Only a 'B' token that is not at the start opens a new word.
        starts_new_word = tag == "B" and index > 0
        pieces.append(" " + piece if starts_new_word else piece)
    return "".join(pieces)
+import re
def has_myanmar(text):
    """Return True if ``text`` contains any character in U+1000-U+109F."""
    return bool(re.search(r'[\u1000-\u109F]', text))


def has_latin(text):
    """Return True if ``text`` contains an ASCII letter (digits do not count)."""
    return bool(re.search(r'[a-zA-Z]', text))


def split_myanmar_latin(chunk):
    """
    Split chunk at Myanmar/Latin boundaries.

    A chunk that does not contain both a Myanmar character and an ASCII
    letter is returned unchanged as a one-element list. Otherwise every
    character is given a script ('myanmar' or 'latin'; ASCII digits count
    as 'latin') and the chunk is cut wherever the script changes:

    - Opening brackets attach to the NEXT letter's script
    - Closing brackets attach to the PREVIOUS letter's script
    - Other symbols attach to the NEXT letter's script, falling back to
      the PREVIOUS one when nothing follows

    A bracket with no letter on the side it attaches to (a closing bracket
    at the very start, an opening bracket at the very end) has no script
    of its own and simply stays in the group it is adjacent to. No
    character is ever dropped: ``"".join(result) == chunk``.

    Returns a list of non-overlapping substrings of ``chunk`` in order.
    """
    if not (has_myanmar(chunk) and has_latin(chunk)):
        return [chunk]

    opening_brackets = set('([{<')
    closing_brackets = set(')]}>')

    # First pass: determine script type for each character.
    char_scripts = []
    for char in chunk:
        if '\u1000' <= char <= '\u109F':
            char_scripts.append('myanmar')
        elif char.isascii() and char.isalnum():
            char_scripts.append('latin')
        else:
            char_scripts.append(None)  # symbol

    # Nearest scripted neighbour strictly after / strictly before each
    # position, computed with one sweep in each direction.
    size = len(chunk)
    following = [None] * size
    nearest = None
    for i in range(size - 1, -1, -1):
        following[i] = nearest
        if char_scripts[i] is not None:
            nearest = char_scripts[i]
    preceding = [None] * size
    nearest = None
    for i, script in enumerate(char_scripts):
        preceding[i] = nearest
        if script is not None:
            nearest = script

    # Second pass: assign symbols to the appropriate script.
    assigned_scripts = list(char_scripts)
    for i, char in enumerate(chunk):
        if char_scripts[i] is not None:
            continue
        if char in opening_brackets:
            assigned_scripts[i] = following[i]
        elif char in closing_brackets:
            assigned_scripts[i] = preceding[i]
        elif following[i] is not None:
            assigned_scripts[i] = following[i]
        else:
            assigned_scripts[i] = preceding[i]

    # Third pass: group consecutive same-script characters. Characters
    # that still have no script never open a new group; in particular,
    # leading ones are accumulated (not overwritten) until the first
    # scripted character fixes the group's script.
    result = []
    current = []
    current_script = None
    for char, script in zip(chunk, assigned_scripts):
        is_boundary = (
            script is not None
            and current_script is not None
            and script != current_script
        )
        if is_boundary:
            result.append("".join(current))
            current = []
        current.append(char)
        if script is not None:
            current_script = script
    if current:
        result.append("".join(current))
    return result
def preprocess(text):
    """
    Turn text into a flat list of tokens.

    The text is split on whitespace, and each resulting word is passed to
    split_myanmar_latin; the pieces it returns are collected in order.
    """
    return [
        piece
        for word in text.split()
        for piece in split_myanmar_latin(word)
    ]
def segment_text(text):
    """
    Run the module-level classifier on text and rebuild a spaced string.

    Each prediction's 'word' and 'entity' fields are fed to reconstruct.
    In the rebuilt string every "▁" is replaced with a space (presumably
    the tokenizer's word-boundary marker - confirm against the model),
    runs of whitespace are collapsed to a single space, and leading and
    trailing whitespace is stripped.
    """
    predictions = classifier(text)
    pieces = [item['word'] for item in predictions]
    tags = [item['entity'] for item in predictions]
    joined = reconstruct(pieces, tags).replace("▁", " ")
    return re.sub(r"\s+", " ", joined).strip()

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff