Gaoussin committed on
Commit
fec37b6
·
verified ·
1 Parent(s): 2553a5d

Upload 3 files

Browse files
Files changed (3) hide show
  1. main.py +20 -11
  2. normalize_bm_input.py +80 -0
  3. normalize_bm_output.py +56 -23
main.py CHANGED
@@ -4,8 +4,8 @@ from fastapi import FastAPI, HTTPException
4
  from pydantic import BaseModel
5
  # Note: Keep the imports together for clarity
6
  from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
7
- from normalize_bm_words import normalize_text
8
- from normalize_bm_output import normalize_output
9
 
10
  # =====================
11
  # 1️⃣ Environment / Cache
@@ -71,15 +71,24 @@ def translateTo(text, src, tgt):
71
  @app.post("/translate")
72
  def translate(request: TranslationRequest):
73
  try:
74
- is_bm = request.src_lang == "bam_Latn"
75
- text = normalize_text(request.text) if is_bm else request.text
76
-
77
- result = translateTo(text, request.src_lang, request.tgt_lang)
78
- normal_result = normalize_output(result) if is_bm else result
79
-
80
- # List structure: [translation, model_version]
81
- translation_list = [normal_result, model_name]
82
-
 
 
 
 
 
 
 
 
 
83
  return [translation_list]
84
  except Exception as e:
85
  print(f"An error occurred during translation: {e}")
 
4
  from pydantic import BaseModel
5
  # Note: Keep the imports together for clarity
6
  from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
7
+ from normalize_bm_input import normalize_bm_input
8
+ from normalize_bm_output import normalize_bm_output
9
 
10
  # =====================
11
  # 1️⃣ Environment / Cache
 
71
  @app.post("/translate")
72
  def translate(request: TranslationRequest):
73
  try:
74
+ ###
75
+ # Check if the source language is Bambara for input normalization
76
+ is_src_bm = request.src_lang == "bam_Latn"
77
+ # Check if the target language is Bambara for output normalization
78
+ is_tgt_bm = request.tgt_lang == "bam_Latn"
79
+ # --- 1. Input Normalization (Pre-processing) ---
80
+ # Normalize the input ONLY if the source language is Bambara.
81
+ text_to_translate = normalize_bm_input(request.text) if is_src_bm else request.text
82
+ print("before",text_to_translate)
83
+ # --- 2. Core Translation ---
84
+ result = translateTo(text_to_translate, request.src_lang, request.tgt_lang)
85
+ print("after",result)
86
+ # --- 3. Output Normalization (Post-processing) ---
87
+ # Normalize the result (apply contractions, e.g. "ka a" -> "k'a") ONLY if the target language is Bambara.
88
+ final_translation = normalize_bm_output(result) if is_tgt_bm else result
89
+ # --- 4. Final Output ---
90
+ translation_list = [final_translation, model_name]
91
+ ###
92
  return [translation_list]
93
  except Exception as e:
94
  print(f"An error occurred during translation: {e}")
normalize_bm_input.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ # Define the de-contraction dictionary.
4
+ # Keys are the contracted forms (what you want to replace).
5
+ # Values are the expanded forms (what you want to replace them with).
6
DE_CONTRACTIONS = {
    # Contracted forms (written with an apostrophe) that expand to a
    # multi-word phrase.
    "k'a": "ka a",
    "a b'a": "a be a",
    "n'be": "ne be",
    "n'b'a": "ne be a",
    "b'a": "be a",
    "k'o": "ko o",
    "b'i": "be i",
    "k'i": "ka i",
    "k'aw": "ka aw",

    # Whole-word spelling substitutions (one word in, one word out).
    "kɔkɔ": "kɔgɔ",
    "bɛ": "be",
}


def _compile_alternation(keys):
    """Compile a regex matching any of *keys* as a whole word or phrase.

    Keys are tried longest first so that a long form such as "n'b'a" wins
    over a shorter form ("b'a") that is contained in it.  The surrounding
    ``\\b`` anchors keep "b'a" from matching inside a longer word such as
    "b'adi".
    """
    ordered = sorted(keys, key=len, reverse=True)
    return re.compile(r"\b(?:" + "|".join(map(re.escape, ordered)) + r")\b")


# Built once at import time: the table is constant, so there is no reason to
# re-filter, re-sort and re-compile it on every request.
_SPELLING_RE = _compile_alternation(
    key for key, value in DE_CONTRACTIONS.items() if " " not in value
)
_EXPANSION_RE = _compile_alternation(
    key for key, value in DE_CONTRACTIONS.items() if " " in value
)


def _lookup_de_contraction(match):
    """Return the DE_CONTRACTIONS replacement for the matched text."""
    found = match.group(0)
    return DE_CONTRACTIONS.get(found, found)


def normalize_bm_input(text: str) -> str:
    """Expand the contracted forms listed in DE_CONTRACTIONS.

    The text is lowercased, single-word spelling substitutions are applied,
    the apostrophe contractions are expanded, and finally the first
    character of the whole string is upper-cased.  Because everything is
    lowercased first, any other capital letters in the input (including
    later sentence starts) stay lowercase in the result.

    Args:
        text: Source-language text to normalise.

    Returns:
        The normalised text; an empty string is returned unchanged.
    """
    # The table keys are lowercase, so match against lowercase text.
    text = text.lower()

    # Spelling substitutions run first and match on word boundaries, so a
    # word attached to an apostrophe is normalised too: "n'bɛ" becomes
    # "n'be", which the expansion pass below can then turn into "ne be".
    text = _SPELLING_RE.sub(_lookup_de_contraction, text)

    # Single left-to-right pass over all contracted forms, longest first.
    text = _EXPANSION_RE.sub(_lookup_de_contraction, text)

    # Slicing (rather than indexing) keeps the empty string safe.
    return text[:1].upper() + text[1:]
71
+
72
+ # --- Example Usage ---
73
+
74
+ #input_text_4 = "k'a di a b'i fɛ kɔkɔ n'b'a fɔ. Bɛ jɛ."
75
+
76
+ #print(f"Original Text: {input_text_4}")
77
+ #normalized_4 = normalize_bm_input(input_text_4)
78
+ #print(f"Normalized Text: {normalized_4}\n")
79
+
80
+ # Expected Output: Ka a di a be i fɛ kɔgɔ ne be a fɔ. be jɛ.  (the text is lowercased and only the very first character is re-capitalized)
normalize_bm_output.py CHANGED
@@ -1,34 +1,67 @@
1
  import re
2
 
3
- # Normalize keys once (lowercase)
4
  CONTRACTIONS = {
 
5
  "ka a": "k'a",
 
6
  "be a": "b'a",
7
- "ne be": "n'be",
8
- "taa a": "ta'a",
9
- "ko o": "k'o"
 
 
10
  }
11
 
12
- # Escape + longest-first (critical for correctness)
13
- PATTERN = re.compile(
14
- r'(?<![^\W\d_])('
15
- + '|'.join(sorted(map(re.escape, CONTRACTIONS), key=len, reverse=True))
16
- + r')(?![^\W\d_])',
17
- flags=re.IGNORECASE | re.UNICODE
18
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- SPACE_QUESTION_RE = re.compile(r'\s*\?')
21
-
22
- def normalize_output(text: str) -> str:
23
- # 1) normalize spacing before ?
24
- text = SPACE_QUESTION_RE.sub(' ?', text)
25
-
26
- # 2) apply contractions (replace expanded phrases such as "ka a" with "k'a")
27
- text = PATTERN.sub(lambda m: CONTRACTIONS[m.group(0).lower()], text)
28
-
29
- # 3) capitalize first letter safely
 
 
 
 
 
 
 
 
30
  return text[:1].upper() + text[1:]
31
 
 
 
 
32
 
33
- #sentence = "a be ka a di ne ma ne be taa a fɛ?"
34
- #print(normalize_text(sentence))
 
 
1
  import re
2
 
3
# Contractions table: keys are the expanded phrases, values are the
# contracted forms that replace them.
CONTRACTIONS = {
    "ka a": "k'a",
    "a be a": "a b'a",
    "be a": "b'a",
    "ko o": "k'o",
    "di i": "d'i",
    "be i": "b'i",
}

# Compiled once at import time.  Keys are ordered longest first so that
# "a be a" is preferred over the "be a" it contains, and ``\b`` anchors keep
# a phrase from matching inside a longer word (e.g. "be adi").  The same
# pattern handles single-word keys, should any be added to the table.
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(CONTRACTIONS, key=len, reverse=True))
    + r")\b"
)


def _lookup_contraction(match):
    """Return the CONTRACTIONS replacement for the matched text."""
    found = match.group(0)
    return CONTRACTIONS.get(found, found)


def normalize_bm_output(text: str) -> str:
    """Replace the expanded phrases listed in CONTRACTIONS with contractions.

    The text is lowercased, every whole-phrase occurrence of a table key is
    replaced by its contracted form, and the first character of the result
    is upper-cased.

    NOTE: lowercasing discards any capitalisation present in the input
    beyond the first character of the string.

    Args:
        text: Text to post-process.

    Returns:
        The contracted text; an empty string is returned unchanged.
    """
    # Table keys are lowercase, so no case-insensitive matching is needed
    # once the text itself is lowercase.
    text = text.lower()

    # Single left-to-right pass over every key, longest first.
    text = _CONTRACTION_RE.sub(_lookup_contraction, text)

    # Slicing (rather than indexing) keeps the empty string safe.
    return text[:1].upper() + text[1:]
60
 
61
+ # --- Example Usage with both types of contractions ---
62
+
63
+ #input_text_4 = "ka a di a be i fɛ kɔgɔ ne be a fɔ."
64
 
65
+ #print(f"Original Text: {input_text_4}")
66
+ #normalized_4 = normalize_bm_output(input_text_4)
67
+ #print(f"Normalized Text: {normalized_4}\n")