Tamil78 committed on
Commit
02585c6
·
verified ·
1 Parent(s): 35753b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -22
app.py CHANGED
@@ -25,27 +25,17 @@ class ConvertResponse(BaseModel):
25
  input_text: str
26
  theni_tamil_text: str
27
 
28
- def _split_into_chunks(text: str, chunk_size: int = 250) -> list:
29
- """Safely splits a giant paragraph into smaller sentences based on periods/punctuation."""
30
  text = text.strip()
31
  if not text:
32
  return []
33
- if len(text) <= chunk_size:
34
- return [text]
35
 
 
36
  sentences = re.split(r"(?<=[.!?])\s+", text)
37
- chunks = []
38
- current = ""
39
- for sentence in sentences:
40
- if len(current) + len(sentence) + 1 <= chunk_size:
41
- current = f"{current} {sentence}".strip()
42
- else:
43
- if current:
44
- chunks.append(current)
45
- current = sentence.strip()
46
- if current:
47
- chunks.append(current)
48
- return chunks
49
 
50
  @app.on_event("startup")
51
  def load_model():
@@ -69,18 +59,18 @@ def convert(payload: ConvertRequest):
69
  raise HTTPException(400, "text is required")
70
 
71
  try:
72
- # 1. Break paragraph into bite-sized sentences
73
- chunks = _split_into_chunks(text, chunk_size=200)
74
  translated_chunks = []
75
 
76
- # 2. Translate each sentence one by one sequentially
77
- for chunk in chunks:
78
- encoded = _tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
79
  output_ids = _model.generate(**encoded, max_length=MAX_LENGTH, num_beams=NUM_BEAMS)
80
  output_text = _tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
81
  translated_chunks.append(" ".join(str(output_text).strip().split()))
82
 
83
- # 3. Stitch them back into a giant translated paragraph!
84
  final_text = " ".join(translated_chunks).strip()
85
 
86
  return ConvertResponse(
 
25
  input_text: str
26
  theni_tamil_text: str
27
 
28
+ def _split_into_sentences(text: str) -> list:
29
+ """Forcefully splits a giant paragraph into true individual sentences aggressively."""
30
  text = text.strip()
31
  if not text:
32
  return []
 
 
33
 
34
+ # Split text exactly at periods, exclamation marks, or question marks
35
  sentences = re.split(r"(?<=[.!?])\s+", text)
36
+
37
+ # Clean up any empty spaces and return the array of individual sentences
38
+ return [s.strip() for s in sentences if s.strip()]
 
 
 
 
 
 
 
 
 
39
 
40
  @app.on_event("startup")
41
  def load_model():
 
59
  raise HTTPException(400, "text is required")
60
 
61
  try:
62
+ # 1. Break paragraph forcefully into exact sentences
63
+ sentences = _split_into_sentences(text)
64
  translated_chunks = []
65
 
66
+ # 2. Translate each individual sentence one by one sequentially
67
+ for sentence in sentences:
68
+ encoded = _tokenizer(sentence, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
69
  output_ids = _model.generate(**encoded, max_length=MAX_LENGTH, num_beams=NUM_BEAMS)
70
  output_text = _tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
71
  translated_chunks.append(" ".join(str(output_text).strip().split()))
72
 
73
+ # 3. Stitch them all back into a giant translated paragraph!
74
  final_text = " ".join(translated_chunks).strip()
75
 
76
  return ConvertResponse(