chuuhtetnaing committed on
Commit
975e2cd
·
1 Parent(s): 1d66d1d

setup project

Browse files
Files changed (8) hide show
  1. .gitignore +23 -0
  2. .pre-commit-config.yaml +14 -0
  3. Makefile +15 -0
  4. app.py +26 -133
  5. pyproject.toml +16 -0
  6. requirements.txt +0 -6
  7. segmentation.py +123 -0
  8. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # ENV
13
+ .env.development
14
+ .env.production
15
+
16
+ # IDE
17
+ .idea
18
+
19
+ # OS
20
+ .DS_Store
21
+
22
+ # Notebook
23
+ notebook/*.png
.pre-commit-config.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
repos:

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.14.10
    hooks:
      # Run the linter.
      # No explicit `--config`: ruff discovers `ruff.toml` / `pyproject.toml`
      # on its own, and errors out if `--config` names a file that is missing.
      - id: ruff
        types_or: [ python, pyi ]
        args: [ --fix ]
      # Run the formatter (configuration is auto-discovered, as above).
      - id: ruff-format
        types_or: [ python, pyi ]
Makefile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
.DEFAULT_GOAL := start

# These targets are commands, not files: declare them phony so that a file or
# directory named `install`, `lint` or `start` can never make them "up to date".
.PHONY: setup-precommit-hook install lint start

# Install the git pre-commit hook and bump hook revisions to their latest tags.
setup-precommit-hook:
	uv run pre-commit install
	uv run pre-commit autoupdate

# Set up the hooks, then sync the environment including the `dev` group.
install: setup-precommit-hook
	uv sync --group dev

# Run every configured pre-commit hook against the whole repository.
lint:
	uv run pre-commit run --all-files

# Launch the Gradio app through the `gradio` CLI.
start:
	uv run gradio app.py
app.py CHANGED
@@ -1,154 +1,47 @@
1
  import gradio as gr
2
- import numpy as np
3
- import random
4
 
5
- # import spaces #[uncomment to use ZeroGPU]
6
- from diffusers import DiffusionPipeline
7
- import torch
8
 
9
- device = "cuda" if torch.cuda.is_available() else "cpu"
10
- model_repo_id = "stabilityai/sdxl-turbo" # Replace to the model you would like to use
 
11
 
12
- if torch.cuda.is_available():
13
- torch_dtype = torch.float16
14
- else:
15
- torch_dtype = torch.float32
16
-
17
- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
18
- pipe = pipe.to(device)
19
-
20
- MAX_SEED = np.iinfo(np.int32).max
21
- MAX_IMAGE_SIZE = 1024
22
-
23
-
24
- # @spaces.GPU #[uncomment to use ZeroGPU]
25
- def infer(
26
- prompt,
27
- negative_prompt,
28
- seed,
29
- randomize_seed,
30
- width,
31
- height,
32
- guidance_scale,
33
- num_inference_steps,
34
- progress=gr.Progress(track_tqdm=True),
35
- ):
36
- if randomize_seed:
37
- seed = random.randint(0, MAX_SEED)
38
-
39
- generator = torch.Generator().manual_seed(seed)
40
-
41
- image = pipe(
42
- prompt=prompt,
43
- negative_prompt=negative_prompt,
44
- guidance_scale=guidance_scale,
45
- num_inference_steps=num_inference_steps,
46
- width=width,
47
- height=height,
48
- generator=generator,
49
- ).images[0]
50
-
51
- return image, seed
52
-
53
-
54
- examples = [
55
- "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
56
- "An astronaut riding a green horse",
57
- "A delicious ceviche cheesecake slice",
58
- ]
59
 
60
  css = """
61
  #col-container {
62
  margin: 0 auto;
63
- max-width: 640px;
 
 
 
 
 
 
 
 
 
 
64
  }
65
  """
66
 
67
  with gr.Blocks(css=css) as demo:
68
  with gr.Column(elem_id="col-container"):
69
- gr.Markdown(" # Text-to-Image Gradio Template")
70
-
71
- with gr.Row():
72
- prompt = gr.Text(
73
- label="Prompt",
74
- show_label=False,
75
- max_lines=1,
76
- placeholder="Enter your prompt",
77
- container=False,
78
- )
79
-
80
- run_button = gr.Button("Run", scale=0, variant="primary")
81
-
82
- result = gr.Image(label="Result", show_label=False)
83
 
84
- with gr.Accordion("Advanced Settings", open=False):
85
- negative_prompt = gr.Text(
86
- label="Negative prompt",
87
- max_lines=1,
88
- placeholder="Enter a negative prompt",
89
- visible=False,
90
  )
91
-
92
- seed = gr.Slider(
93
- label="Seed",
94
- minimum=0,
95
- maximum=MAX_SEED,
96
- step=1,
97
- value=0,
98
  )
99
 
100
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
101
-
102
- with gr.Row():
103
- width = gr.Slider(
104
- label="Width",
105
- minimum=256,
106
- maximum=MAX_IMAGE_SIZE,
107
- step=32,
108
- value=1024, # Replace with defaults that work for your model
109
- )
110
-
111
- height = gr.Slider(
112
- label="Height",
113
- minimum=256,
114
- maximum=MAX_IMAGE_SIZE,
115
- step=32,
116
- value=1024, # Replace with defaults that work for your model
117
- )
118
-
119
- with gr.Row():
120
- guidance_scale = gr.Slider(
121
- label="Guidance scale",
122
- minimum=0.0,
123
- maximum=10.0,
124
- step=0.1,
125
- value=0.0, # Replace with defaults that work for your model
126
- )
127
-
128
- num_inference_steps = gr.Slider(
129
- label="Number of inference steps",
130
- minimum=1,
131
- maximum=50,
132
- step=1,
133
- value=2, # Replace with defaults that work for your model
134
- )
135
 
136
- gr.Examples(examples=examples, inputs=[prompt])
137
- gr.on(
138
- triggers=[run_button.click, prompt.submit],
139
- fn=infer,
140
- inputs=[
141
- prompt,
142
- negative_prompt,
143
- seed,
144
- randomize_seed,
145
- width,
146
- height,
147
- guidance_scale,
148
- num_inference_steps,
149
- ],
150
- outputs=[result, seed],
151
- )
152
 
153
  if __name__ == "__main__":
154
  demo.launch()
 
1
  import gradio as gr
 
 
2
 
 
 
 
3
 
4
def segment(text: str) -> str:
    """Return the segmented form of *text*.

    Placeholder implementation: no segmentation is performed yet, so the
    input string is handed back unchanged.
    """
    segmented = text
    return segmented
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
# Page styling: a centred, width-capped container, and an input/output row
# that sits side by side on wide screens and stacks vertically below 768px.
css = """
#col-container {
    margin: 0 auto;
    max-width: 900px;
    padding: 0 1rem;
}
#input-output-row {
    flex-direction: row;
    gap: 1rem;
}
@media (max-width: 768px) {
    #input-output-row {
        flex-direction: column;
    }
}
"""

# NOTE(review): Gradio 6 documents `css` as a `launch()` parameter rather than
# a `Blocks()` one -- confirm against the Gradio version actually deployed
# before moving it.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# Myanmar Text Segmentation")

        # Input on the left, segmented output on the right.
        with gr.Row(elem_id="input-output-row", equal_height=True):
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Myanmar text here...",
                lines=6,
            )
            output_text = gr.Textbox(
                label="Segmented Text",
                lines=6,
            )

        run_button = gr.Button("Segment", variant="primary")

    # The button click and the textbox submit event run the same handler.
    handler = {"fn": segment, "inputs": input_text, "outputs": output_text}
    run_button.click(**handler)
    input_text.submit(**handler)

if __name__ == "__main__":
    demo.launch()
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "myanmar-text-segmentation-app"
3
+ version = "0.1.0"
4
+ description = "Gradio demo app for Myanmar text segmentation"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "gradio>=6.2.0",
8
+ "torch>=2.9.1",
9
+ "torchvision>=0.24.1",
10
+ "transformers>=4.57.3",
11
+ ]
12
+
13
+ [dependency-groups]
14
+ dev = [
15
+ "pre-commit>=4.5.1",
16
+ ]
requirements.txt DELETED
@@ -1,6 +0,0 @@
1
- accelerate
2
- diffusers
3
- invisible_watermark
4
- torch
5
- transformers
6
- xformers
 
 
 
 
 
 
 
segmentation.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
+ classifier = pipeline("ner", model="chuuhtetnaing/myanmar_text_segmentation_model")
4
+
5
+
6
def reconstruct(tokens, labels):
    """
    Join tokens into one string using their B/I labels.

    A single space is inserted in front of every token labelled "B", except
    when that token is the very first one. Tokens and labels are paired
    positionally; pairing stops at the shorter of the two sequences.
    """
    return "".join(
        " " + token if position and label == "B" else token
        for position, (token, label) in enumerate(zip(tokens, labels))
    )
17
+
18
+
19
+ import re
20
+
21
+
22
def has_myanmar(text):
    """Return True if *text* contains a character in U+1000-U+109F (the Myanmar block)."""
    found = re.search(r'[\u1000-\u109F]', text)
    return found is not None
24
+
25
+
26
def has_latin(text):
    """Return True if *text* contains at least one ASCII letter (a-z or A-Z)."""
    found = re.search(r'[a-zA-Z]', text)
    return found is not None
28
+
29
+
30
def _script_of(char):
    """Classify one character as 'myanmar', 'latin' (ASCII letter/digit) or None (symbol)."""
    if "\u1000" <= char <= "\u109F":
        return 'myanmar'
    if char.isascii() and char.isalnum():
        return 'latin'
    return None


def split_myanmar_latin(chunk):
    """
    Split chunk at Myanmar/Latin boundaries.

    Characters in U+1000-U+109F count as Myanmar and ASCII letters/digits as
    Latin. Every other character is a symbol and joins a neighbouring script:
    - Opening brackets attach to the NEXT letter's script
    - Closing brackets attach to the PREVIOUS letter's script
    - Other symbols attach to the NEXT letter's script, falling back to the
      PREVIOUS one when no letter follows

    A symbol that cannot be assigned (an opening bracket with no letter after
    it, or a closing bracket with no letter before it) stays in whichever
    group it is adjacent to, so no character of the chunk is ever dropped.

    Chunks that do not contain both Myanmar and Latin letters are returned
    unchanged as a single-element list.

    Examples:
        "Hello(မြန်မာ)" -> ["Hello", "(မြန်မာ)"]
        ")abcမြန်"      -> [")abc", "မြန်"]
    """
    if not (has_myanmar(chunk) and has_latin(chunk)):
        return [chunk]

    opening_brackets = frozenset('([{<')
    closing_brackets = frozenset(')]}>')

    # First pass: script of each character (None for symbols).
    scripts = [_script_of(char) for char in chunk]

    # Nearest letter script strictly before / strictly after each position,
    # computed with one forward and one backward sweep instead of rescanning
    # the chunk for every symbol.
    prev_script = []
    seen = None
    for script in scripts:
        prev_script.append(seen)
        if script is not None:
            seen = script

    next_script = [None] * len(scripts)
    seen = None
    for i in range(len(scripts) - 1, -1, -1):
        next_script[i] = seen
        if scripts[i] is not None:
            seen = scripts[i]

    # Second pass: attach each symbol to a script according to the rules above.
    assigned = list(scripts)
    for i, char in enumerate(chunk):
        if scripts[i] is not None:
            continue
        if char in opening_brackets:
            assigned[i] = next_script[i]
        elif char in closing_brackets:
            assigned[i] = prev_script[i]
        else:
            assigned[i] = next_script[i] or prev_script[i]

    # Third pass: group consecutive characters that share a script.
    parts = []
    current = []
    current_script = None
    for char, script in zip(chunk, assigned):
        if script is None or current_script is None or script == current_script:
            # Unassigned symbols never open a new group. While the running
            # group has no script yet (leading unassigned symbols), keep
            # accumulating rather than restarting, so those symbols survive.
            current.append(char)
            if current_script is None:
                current_script = script
        else:
            parts.append("".join(current))
            current = [char]
            current_script = script

    if current:
        parts.append("".join(current))

    return parts
103
+
104
+
105
def preprocess(text):
    """
    Tokenize text for segmentation.

    The text is split on whitespace, then every whitespace-delimited chunk is
    passed through split_myanmar_latin; the resulting pieces are returned as
    one flat list in their original order.
    """
    return [
        piece
        for chunk in text.split()
        for piece in split_myanmar_latin(chunk)
    ]
112
+
113
+
114
def segment_text(text):
    """
    Segment text with the module-level token-classification pipeline.

    Each prediction's 'word' and 'entity' fields are collected and handed to
    reconstruct, which inserts the spaces. Every "▁" character in the result
    is then turned into a space (presumably the tokenizer's word-boundary
    marker -- confirm against the model's tokenizer), runs of whitespace are
    collapsed to a single space, and the ends are trimmed.
    """
    words = []
    entities = []
    for prediction in classifier(text):
        words.append(prediction['word'])
        entities.append(prediction['entity'])

    spaced = reconstruct(words, entities).replace("▁", " ")
    return re.sub(r"\s+", " ", spaced).strip()
123
+
uv.lock ADDED
The diff for this file is too large to render. See raw diff