marcosremar2 committed on
Commit
1e3ff26
·
1 Parent(s): 3722cb7

Implementar UFPAlign com segmentação de sílabas
- Substituir MFA por UFPAlign para alinhamento específico do português brasileiro
- Adicionar interface para mostrar início e fim de cada sílaba
- Exibir informações detalhadas de fonemas, palavras e sílabas
- Usar repositório oficial do UFPAlign da UFPA
- Interface otimizada para visualização de segmentação silábica

Browse files
Files changed (2) hide show
  1. Dockerfile +73 -20
  2. app.py +162 -71
Dockerfile CHANGED
@@ -1,9 +1,53 @@
1
- # Montreal Forced Aligner with Portuguese Models
2
- FROM mmcauliffe/montreal-forced-aligner:latest
3
 
4
- LABEL maintainer="MFA Portuguese Alignment - Hugging Face Spaces"
 
 
5
 
6
- # Install additional Python packages for FastAPI (Python 3.12 compatible versions)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  RUN pip install --no-cache-dir \
8
  fastapi \
9
  uvicorn \
@@ -11,28 +55,37 @@ RUN pip install --no-cache-dir \
11
  pydantic \
12
  textgrid \
13
  pandas \
14
- numpy
 
 
 
15
 
16
- # Create workspace
17
- WORKDIR /app
18
-
19
- # Download Portuguese models during build
20
- RUN mfa model download dictionary portuguese_mfa && \
21
- mfa model download acoustic portuguese_mfa && \
22
- mfa model download g2p portuguese_brazil_mfa
23
 
24
  # Copy application files
25
- COPY app.py /app/
26
- COPY README.md /app/
 
 
 
 
 
27
 
28
- # Create uploads directory
29
- RUN mkdir -p /app/uploads /app/output
 
 
 
 
30
 
31
  # Expose port
32
  EXPOSE 7860
33
 
34
- # Set environment variable for Gradio
35
- ENV GRADIO_SERVER_NAME="0.0.0.0"
 
36
 
37
- # Run the application
38
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ # UFPAlign Brazilian Portuguese Speech Alignment Container
2
+ FROM kaldiasr/kaldi:latest
3
 
4
+ LABEL maintainer="UFPAlign Hugging Face Space"
5
+ LABEL description="UFPAlign - Brazilian Portuguese Forced Phonetic Alignment Tool"
6
+ LABEL version="1.0"
7
 
8
+ # Set environment variables
9
+ ENV UFPALIGN_DIR=/opt/UFPAlign
10
+ ENV KALDI_ROOT=/opt/kaldi
11
+ ENV LC_ALL=pt_BR.UTF-8
12
+ ENV LANG=pt_BR.UTF-8
13
+ ENV PYTHONPATH=/opt/UFPAlign:$PYTHONPATH
14
+
15
+ # Update system and install dependencies
16
+ RUN apt-get update && \
17
+ apt-get install -y --no-install-recommends \
18
+ sudo \
19
+ curl \
20
+ wget \
21
+ openjdk-8-jdk \
22
+ locales \
23
+ python3-pip \
24
+ python3-dev \
25
+ python3-setuptools \
26
+ build-essential \
27
+ sox \
28
+ ffmpeg \
29
+ git \
30
+ unzip && \
31
+ # Configure locale for Portuguese (Brazil)
32
+ sed -i '/pt_BR.UTF-8/s/^# //g' /etc/locale.gen && \
33
+ locale-gen && \
34
+ # Upgrade pip
35
+ python3 -m pip install --upgrade pip && \
36
+ # Cleanup
37
+ apt-get clean && \
38
+ rm -rf /var/lib/apt/lists/*
39
+
40
+ # Create UFPAlign directory
41
+ RUN mkdir -p ${UFPALIGN_DIR}
42
+
43
+ # Clone UFPAlign repository
44
+ RUN cd /opt && \
45
+ git clone https://github.com/falabrasil/ufpalign.git UFPAlign && \
46
+ cd UFPAlign && \
47
+ # Make the shell script executable
48
+ chmod +x ufpalign.sh
49
+
50
+ # Install Python dependencies for UFPAlign and FastAPI
51
  RUN pip install --no-cache-dir \
52
  fastapi \
53
  uvicorn \
 
55
  pydantic \
56
  textgrid \
57
  pandas \
58
+ numpy \
59
+ scikit-learn \
60
+ scipy \
61
+ matplotlib
62
 
63
+ # Install additional UFPAlign Python requirements if they exist
64
+ RUN cd ${UFPALIGN_DIR} && \
65
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
 
 
 
 
66
 
67
  # Copy application files
68
+ COPY app.py /app/app.py
69
+
70
+ # Create necessary directories
71
+ RUN mkdir -p /app/uploads /app/output /app/logs
72
+
73
+ # Set working directory
74
+ WORKDIR /app
75
 
76
+ # Sanity-check the UFPAlign checkout: list demo/ if it exists (a missing directory only prints a notice) and mark every shell script executable
77
+ RUN cd ${UFPALIGN_DIR} && \
78
+ # Check if demo files exist and are accessible
79
+ ls -la demo/ || echo "Demo directory not found" && \
80
+ # Ensure scripts are executable
81
+ find . -name "*.sh" -exec chmod +x {} \;
82
 
83
  # Expose port
84
  EXPOSE 7860
85
 
86
+ # Health check
87
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
88
+ CMD curl -f http://localhost:7860/health || exit 1
89
 
90
+ # Start the application
91
+ CMD ["python3", "app.py"]
app.py CHANGED
@@ -1,10 +1,10 @@
1
  #!/usr/bin/env python3
2
  """
3
- Montreal Forced Aligner - Portuguese Brazilian Speech Alignment
4
  FastAPI Application for Hugging Face Spaces
5
 
6
- Uses the Montreal Forced Aligner (MFA) with pre-trained Portuguese models
7
- for high-quality forced speech alignment.
8
  """
9
 
10
  import os
@@ -27,8 +27,8 @@ logger = logging.getLogger(__name__)
27
 
28
  # Initialize FastAPI app
29
  app = FastAPI(
30
- title="MFA Portuguese Alignment",
31
- description="Portuguese Brazilian speech alignment using Montreal Forced Aligner",
32
  version="1.0.0"
33
  )
34
 
@@ -38,9 +38,9 @@ OUTPUT_DIR = Path("/app/output")
38
  UPLOAD_DIR.mkdir(exist_ok=True)
39
  OUTPUT_DIR.mkdir(exist_ok=True)
40
 
41
- def run_mfa_alignment(audio_path: str, text_content: str, output_dir: str) -> tuple[bool, str, str]:
42
  """
43
- Run Montreal Forced Aligner on the input audio and text.
44
 
45
  Args:
46
  audio_path: Path to the audio file
@@ -51,59 +51,64 @@ def run_mfa_alignment(audio_path: str, text_content: str, output_dir: str) -> tu
51
  Tuple of (success, textgrid_path, error_message)
52
  """
53
  try:
54
- # Create temporary directory for MFA processing
55
  with tempfile.TemporaryDirectory() as temp_dir:
56
  temp_path = Path(temp_dir)
57
 
58
- # Create MFA corpus structure
59
- corpus_dir = temp_path / "corpus"
60
- corpus_dir.mkdir()
61
 
62
  # Copy audio file
63
  audio_name = Path(audio_path).stem
64
- shutil.copy2(audio_path, corpus_dir / f"{audio_name}.wav")
 
65
 
66
  # Create text file
67
- text_file = corpus_dir / f"{audio_name}.txt"
68
  with open(text_file, 'w', encoding='utf-8') as f:
69
  f.write(text_content.strip())
70
 
71
  # Create output directory
72
- alignment_dir = temp_path / "alignment"
73
  alignment_dir.mkdir()
74
 
75
- logger.info(f"🎯 Running MFA alignment for: {audio_name}")
76
  logger.info(f"📝 Text: {text_content[:100]}...")
77
 
78
- # Run MFA alignment
79
  cmd = [
80
- "mfa", "align",
81
- str(corpus_dir),
82
- "portuguese_mfa", # Use pre-downloaded Portuguese dictionary model
83
- "portuguese_mfa", # Use pre-downloaded Portuguese acoustic model
84
- str(alignment_dir),
85
- "--clean"
86
  ]
87
 
88
- logger.info(f"🚀 MFA Command: {' '.join(cmd)}")
89
 
90
  result = subprocess.run(
91
  cmd,
92
  capture_output=True,
93
  text=True,
94
- timeout=300 # 5 minute timeout
 
95
  )
96
 
97
  if result.returncode != 0:
98
- error_msg = f"MFA alignment failed: {result.stderr}"
99
  logger.error(error_msg)
100
  return False, "", error_msg
101
 
102
- # Find the generated TextGrid file
103
  textgrid_file = alignment_dir / f"{audio_name}.TextGrid"
104
 
 
 
 
 
 
105
  if not textgrid_file.exists():
106
- error_msg = f"TextGrid file not found: {textgrid_file}"
107
  logger.error(error_msg)
108
  return False, "", error_msg
109
 
@@ -111,24 +116,24 @@ def run_mfa_alignment(audio_path: str, text_content: str, output_dir: str) -> tu
111
  output_path = Path(output_dir) / f"{audio_name}.TextGrid"
112
  shutil.copy2(textgrid_file, output_path)
113
 
114
- logger.info(f"✅ Alignment completed: {output_path}")
115
  return True, str(output_path), ""
116
 
117
  except subprocess.TimeoutExpired:
118
- return False, "", "MFA alignment timed out after 5 minutes"
119
  except Exception as e:
120
- logger.error(f"MFA alignment error: {str(e)}")
121
  return False, "", f"Alignment error: {str(e)}"
122
 
123
- def parse_textgrid_to_json(textgrid_path: str) -> Dict:
124
  """
125
- Parse TextGrid file and extract alignment information.
126
 
127
  Args:
128
  textgrid_path: Path to the TextGrid file
129
 
130
  Returns:
131
- Dictionary with alignment data
132
  """
133
  try:
134
  # Load TextGrid
@@ -137,9 +142,13 @@ def parse_textgrid_to_json(textgrid_path: str) -> Dict:
137
  result = {
138
  "filename": Path(textgrid_path).name,
139
  "duration": tg.maxTime,
 
 
 
140
  "tiers": []
141
  }
142
 
 
143
  for tier in tg:
144
  tier_data = {
145
  "name": tier.name,
@@ -149,12 +158,39 @@ def parse_textgrid_to_json(textgrid_path: str) -> Dict:
149
 
150
  if hasattr(tier, 'intervals'):
151
  for interval in tier:
152
- tier_data["intervals"].append({
153
  "start": round(interval.minTime, 3),
154
  "end": round(interval.maxTime, 3),
155
  "duration": round(interval.maxTime - interval.minTime, 3),
156
  "text": interval.mark
157
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  result["tiers"].append(tier_data)
160
 
@@ -171,10 +207,10 @@ async def main_interface():
171
  <!DOCTYPE html>
172
  <html>
173
  <head>
174
- <title>MFA Portuguese Alignment</title>
175
  <meta charset="UTF-8">
176
  <style>
177
- body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
178
  .header { text-align: center; margin-bottom: 30px; }
179
  .form-group { margin-bottom: 20px; }
180
  label { display: block; margin-bottom: 5px; font-weight: bold; }
@@ -185,20 +221,25 @@ async def main_interface():
185
  .info { background: #f8f9fa; padding: 15px; border-radius: 4px; margin-bottom: 20px; }
186
  .result { background: #d4edda; padding: 15px; border-radius: 4px; margin-top: 20px; }
187
  .error { background: #f8d7da; padding: 15px; border-radius: 4px; margin-top: 20px; }
 
 
 
 
188
  </style>
189
  </head>
190
  <body>
191
  <div class="header">
192
- <h1>🎯 MFA Portuguese Alignment</h1>
193
- <p>Montreal Forced Aligner com modelos portugueses pré-treinados</p>
194
  </div>
195
 
196
  <div class="info">
197
- <h3>📋 Instruções:</h3>
198
  <ul>
199
- <li><strong>Áudio:</strong> Arquivo WAV, preferível 16kHz mono</li>
200
- <li><strong>Texto:</strong> Transcrição exata do áudio em português</li>
201
- <li><strong>Resultado:</strong> Arquivo TextGrid com alinhamento fonético</li>
 
202
  </ul>
203
  </div>
204
 
@@ -213,7 +254,7 @@ async def main_interface():
213
  <textarea id="text" name="text" placeholder="Digite aqui o texto exato que está sendo falado no áudio..." required></textarea>
214
  </div>
215
 
216
- <button type="submit">🚀 Executar Alinhamento</button>
217
  </form>
218
 
219
  <div id="result"></div>
@@ -235,7 +276,7 @@ async def main_interface():
235
  formData.append('text', text);
236
 
237
  const resultDiv = document.getElementById('result');
238
- resultDiv.innerHTML = '<div class="info">⏳ Processando alinhamento... Isso pode levar alguns minutos.</div>';
239
 
240
  try {
241
  const response = await fetch('/align', {
@@ -246,18 +287,71 @@ async def main_interface():
246
  const result = await response.json();
247
 
248
  if (response.ok) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  resultDiv.innerHTML = `
250
  <div class="result">
251
- <h3>✅ Alinhamento Concluído!</h3>
252
  <p><strong>Arquivo:</strong> ${result.filename}</p>
253
- <p><strong>Duração:</strong> ${result.duration.toFixed(2)}s</p>
254
- <p><strong>Tiers encontradas:</strong> ${result.tiers.length}</p>
255
  <a href="/download/${result.filename.replace('.TextGrid', '')}" target="_blank">
256
  📥 Download TextGrid
257
  </a>
258
- <details style="margin-top: 15px;">
259
- <summary>📊 Visualizar Dados de Alinhamento</summary>
260
- <pre style="background: #f8f9fa; padding: 10px; overflow-x: auto;">${JSON.stringify(result, null, 2)}</pre>
 
 
 
 
 
261
  </details>
262
  </div>
263
  `;
@@ -289,7 +383,7 @@ async def align_audio(
289
  text: str = Form(...)
290
  ):
291
  """
292
- Perform forced alignment on uploaded audio and text.
293
  """
294
  try:
295
  # Validate file type
@@ -309,8 +403,8 @@ async def align_audio(
309
  logger.info(f"📁 Arquivo salvo: {audio_path}")
310
  logger.info(f"📝 Texto: {text[:100]}...")
311
 
312
- # Run alignment
313
- success, textgrid_path, error_msg = run_mfa_alignment(
314
  str(audio_path),
315
  text,
316
  str(OUTPUT_DIR)
@@ -319,8 +413,8 @@ async def align_audio(
319
  if not success:
320
  raise HTTPException(status_code=500, detail=error_msg)
321
 
322
- # Parse TextGrid and return results
323
- result = parse_textgrid_to_json(textgrid_path)
324
 
325
  # Cleanup uploaded file
326
  try:
@@ -360,29 +454,26 @@ async def download_textgrid(filename: str):
360
  @app.get("/health")
361
  async def health_check():
362
  """Health check endpoint."""
363
- return {"status": "healthy", "aligner": "Montreal Forced Aligner", "language": "Portuguese"}
364
 
365
  @app.get("/models")
366
- async def list_models():
367
- """List available MFA models."""
368
  try:
369
- # Check acoustic models
370
- acoustic_result = subprocess.run(
371
- ["mfa", "model", "list", "acoustic"],
372
- capture_output=True,
373
- text=True
374
- )
375
-
376
- # Check G2P models
377
- g2p_result = subprocess.run(
378
- ["mfa", "model", "list", "g2p"],
379
  capture_output=True,
380
  text=True
381
  )
382
 
383
  return {
384
- "acoustic_models": acoustic_result.stdout.split('\n') if acoustic_result.returncode == 0 else [],
385
- "g2p_models": g2p_result.stdout.split('\n') if g2p_result.returncode == 0 else []
 
 
 
 
386
  }
387
  except Exception as e:
388
  return {"error": str(e)}
 
1
  #!/usr/bin/env python3
2
  """
3
+ UFPAlign - Brazilian Portuguese Speech Alignment
4
  FastAPI Application for Hugging Face Spaces
5
 
6
+ Uses UFPAlign (Universidade Federal do Pará) for high-quality forced speech alignment
7
+ specifically designed for Brazilian Portuguese, with detailed syllable information.
8
  """
9
 
10
  import os
 
27
 
28
  # Initialize FastAPI app
29
  app = FastAPI(
30
+ title="UFPAlign Portuguese Syllable Alignment",
31
+ description="Brazilian Portuguese speech alignment using UFPAlign with detailed syllable segmentation",
32
  version="1.0.0"
33
  )
34
 
 
38
  UPLOAD_DIR.mkdir(exist_ok=True)
39
  OUTPUT_DIR.mkdir(exist_ok=True)
40
 
41
+ def run_ufpalign_alignment(audio_path: str, text_content: str, output_dir: str) -> tuple[bool, str, str]:
42
  """
43
+ Run UFPAlign on the input audio and text.
44
 
45
  Args:
46
  audio_path: Path to the audio file
 
51
  Tuple of (success, textgrid_path, error_message)
52
  """
53
  try:
54
+ # Create temporary directory for UFPAlign processing
55
  with tempfile.TemporaryDirectory() as temp_dir:
56
  temp_path = Path(temp_dir)
57
 
58
+ # Create UFPAlign input structure
59
+ input_dir = temp_path / "input"
60
+ input_dir.mkdir()
61
 
62
  # Copy audio file
63
  audio_name = Path(audio_path).stem
64
+ audio_input = input_dir / f"{audio_name}.wav"
65
+ shutil.copy2(audio_path, audio_input)
66
 
67
  # Create text file
68
+ text_file = input_dir / f"{audio_name}.txt"
69
  with open(text_file, 'w', encoding='utf-8') as f:
70
  f.write(text_content.strip())
71
 
72
  # Create output directory
73
+ alignment_dir = temp_path / "output"
74
  alignment_dir.mkdir()
75
 
76
+ logger.info(f"🎯 Running UFPAlign for: {audio_name}")
77
  logger.info(f"📝 Text: {text_content[:100]}...")
78
 
79
+ # Run UFPAlign using the shell script
80
  cmd = [
81
+ "/opt/UFPAlign/ufpalign.sh",
82
+ str(audio_input),
83
+ str(text_file),
84
+ str(alignment_dir)
 
 
85
  ]
86
 
87
+ logger.info(f"🚀 UFPAlign Command: {' '.join(cmd)}")
88
 
89
  result = subprocess.run(
90
  cmd,
91
  capture_output=True,
92
  text=True,
93
+ timeout=600, # 10 minute timeout
94
+ cwd="/opt/UFPAlign"
95
  )
96
 
97
  if result.returncode != 0:
98
+ error_msg = f"UFPAlign failed: {result.stderr}\nSTDOUT: {result.stdout}"
99
  logger.error(error_msg)
100
  return False, "", error_msg
101
 
102
+ # Look for the generated TextGrid file
103
  textgrid_file = alignment_dir / f"{audio_name}.TextGrid"
104
 
105
+ # UFPAlign might create files with different names, so let's search
106
+ textgrid_files = list(alignment_dir.glob("*.TextGrid"))
107
+ if textgrid_files:
108
+ textgrid_file = textgrid_files[0]
109
+
110
  if not textgrid_file.exists():
111
+ error_msg = f"TextGrid file not found in: {alignment_dir}. Available files: {list(alignment_dir.iterdir())}"
112
  logger.error(error_msg)
113
  return False, "", error_msg
114
 
 
116
  output_path = Path(output_dir) / f"{audio_name}.TextGrid"
117
  shutil.copy2(textgrid_file, output_path)
118
 
119
+ logger.info(f"✅ UFPAlign completed: {output_path}")
120
  return True, str(output_path), ""
121
 
122
  except subprocess.TimeoutExpired:
123
+ return False, "", "UFPAlign timed out after 10 minutes"
124
  except Exception as e:
125
+ logger.error(f"UFPAlign error: {str(e)}")
126
  return False, "", f"Alignment error: {str(e)}"
127
 
128
+ def parse_textgrid_to_syllable_info(textgrid_path: str) -> Dict:
129
  """
130
+ Parse TextGrid file and extract detailed syllable information from UFPAlign.
131
 
132
  Args:
133
  textgrid_path: Path to the TextGrid file
134
 
135
  Returns:
136
+ Dictionary with detailed syllable alignment data
137
  """
138
  try:
139
  # Load TextGrid
 
142
  result = {
143
  "filename": Path(textgrid_path).name,
144
  "duration": tg.maxTime,
145
+ "syllables": [],
146
+ "phonemes": [],
147
+ "words": [],
148
  "tiers": []
149
  }
150
 
151
+ # Process each tier
152
  for tier in tg:
153
  tier_data = {
154
  "name": tier.name,
 
158
 
159
  if hasattr(tier, 'intervals'):
160
  for interval in tier:
161
+ interval_data = {
162
  "start": round(interval.minTime, 3),
163
  "end": round(interval.maxTime, 3),
164
  "duration": round(interval.maxTime - interval.minTime, 3),
165
  "text": interval.mark
166
+ }
167
+ tier_data["intervals"].append(interval_data)
168
+
169
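+ # Categorize by case-insensitive substring match on the tier name; the first matching branch wins (syllable, then phoneme, then word)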
170
+ if "sil" in tier.name.lower() or "syllable" in tier.name.lower():
171
+ if interval.mark.strip() and interval.mark.strip() != "":
172
+ result["syllables"].append({
173
+ "syllable": interval.mark,
174
+ "start_time": round(interval.minTime, 3),
175
+ "end_time": round(interval.maxTime, 3),
176
+ "duration": round(interval.maxTime - interval.minTime, 3)
177
+ })
178
+ elif "phone" in tier.name.lower() or "fone" in tier.name.lower():
179
+ if interval.mark.strip() and interval.mark.strip() != "":
180
+ result["phonemes"].append({
181
+ "phoneme": interval.mark,
182
+ "start_time": round(interval.minTime, 3),
183
+ "end_time": round(interval.maxTime, 3),
184
+ "duration": round(interval.maxTime - interval.minTime, 3)
185
+ })
186
+ elif "word" in tier.name.lower() or "palavra" in tier.name.lower():
187
+ if interval.mark.strip() and interval.mark.strip() != "":
188
+ result["words"].append({
189
+ "word": interval.mark,
190
+ "start_time": round(interval.minTime, 3),
191
+ "end_time": round(interval.maxTime, 3),
192
+ "duration": round(interval.maxTime - interval.minTime, 3)
193
+ })
194
 
195
  result["tiers"].append(tier_data)
196
 
 
207
  <!DOCTYPE html>
208
  <html>
209
  <head>
210
+ <title>UFPAlign - Segmentação de Sílabas</title>
211
  <meta charset="UTF-8">
212
  <style>
213
+ body { font-family: Arial, sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; }
214
  .header { text-align: center; margin-bottom: 30px; }
215
  .form-group { margin-bottom: 20px; }
216
  label { display: block; margin-bottom: 5px; font-weight: bold; }
 
221
  .info { background: #f8f9fa; padding: 15px; border-radius: 4px; margin-bottom: 20px; }
222
  .result { background: #d4edda; padding: 15px; border-radius: 4px; margin-top: 20px; }
223
  .error { background: #f8d7da; padding: 15px; border-radius: 4px; margin-top: 20px; }
224
+ .syllable-item { background: #e7f3ff; padding: 10px; margin: 5px 0; border-radius: 4px; border-left: 4px solid #007bff; }
225
+ .syllable-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 10px; margin-top: 15px; }
226
+ .tier-section { margin-bottom: 20px; }
227
+ .tier-title { font-weight: bold; color: #333; margin-bottom: 10px; }
228
  </style>
229
  </head>
230
  <body>
231
  <div class="header">
232
+ <h1>🎯 UFPAlign - Segmentação de Sílabas</h1>
233
+ <p>Alinhamento fonético brasileiro com informações detalhadas de sílabas</p>
234
  </div>
235
 
236
  <div class="info">
237
+ <h3>📋 Sobre o UFPAlign:</h3>
238
  <ul>
239
+ <li><strong>Desenvolvido pela UFPA:</strong> Especificamente para português brasileiro</li>
240
+ <li><strong>Sílabas:</strong> Mostra início e fim de cada sílaba com precisão</li>
241
+ <li><strong>Multi-camadas:</strong> Fonemas, sílabas, palavras e transcrições</li>
242
+ <li><strong>Áudio:</strong> Arquivo WAV, preferencialmente 16kHz mono</li>
243
  </ul>
244
  </div>
245
 
 
254
  <textarea id="text" name="text" placeholder="Digite aqui o texto exato que está sendo falado no áudio..." required></textarea>
255
  </div>
256
 
257
+ <button type="submit">🚀 Executar Segmentação de Sílabas</button>
258
  </form>
259
 
260
  <div id="result"></div>
 
276
  formData.append('text', text);
277
 
278
  const resultDiv = document.getElementById('result');
279
+ resultDiv.innerHTML = '<div class="info">⏳ Processando com UFPAlign... Isso pode levar alguns minutos.</div>';
280
 
281
  try {
282
  const response = await fetch('/align', {
 
287
  const result = await response.json();
288
 
289
  if (response.ok) {
290
+ let syllableHtml = '';
291
+ if (result.syllables && result.syllables.length > 0) {
292
+ syllableHtml = '<div class="tier-section"><div class="tier-title">🔤 Sílabas Identificadas:</div><div class="syllable-grid">';
293
+ result.syllables.forEach((syl, index) => {
294
+ syllableHtml += `
295
+ <div class="syllable-item">
296
+ <strong>Sílaba ${index + 1}:</strong> "${syl.syllable}"<br>
297
+ <strong>Início:</strong> ${syl.start_time}s<br>
298
+ <strong>Fim:</strong> ${syl.end_time}s<br>
299
+ <strong>Duração:</strong> ${syl.duration}s
300
+ </div>
301
+ `;
302
+ });
303
+ syllableHtml += '</div></div>';
304
+ }
305
+
306
+ let phonemeHtml = '';
307
+ if (result.phonemes && result.phonemes.length > 0) {
308
+ phonemeHtml = '<div class="tier-section"><div class="tier-title">🔊 Fonemas:</div><div class="syllable-grid">';
309
+ result.phonemes.forEach((ph, index) => {
310
+ phonemeHtml += `
311
+ <div class="syllable-item" style="background: #fff3cd;">
312
+ <strong>Fonema ${index + 1}:</strong> "${ph.phoneme}"<br>
313
+ <strong>Início:</strong> ${ph.start_time}s<br>
314
+ <strong>Fim:</strong> ${ph.end_time}s<br>
315
+ <strong>Duração:</strong> ${ph.duration}s
316
+ </div>
317
+ `;
318
+ });
319
+ phonemeHtml += '</div></div>';
320
+ }
321
+
322
+ let wordHtml = '';
323
+ if (result.words && result.words.length > 0) {
324
+ wordHtml = '<div class="tier-section"><div class="tier-title">📝 Palavras:</div><div class="syllable-grid">';
325
+ result.words.forEach((word, index) => {
326
+ wordHtml += `
327
+ <div class="syllable-item" style="background: #d1ecf1;">
328
+ <strong>Palavra ${index + 1}:</strong> "${word.word}"<br>
329
+ <strong>Início:</strong> ${word.start_time}s<br>
330
+ <strong>Fim:</strong> ${word.end_time}s<br>
331
+ <strong>Duração:</strong> ${word.duration}s
332
+ </div>
333
+ `;
334
+ });
335
+ wordHtml += '</div></div>';
336
+ }
337
+
338
  resultDiv.innerHTML = `
339
  <div class="result">
340
+ <h3>✅ Segmentação Concluída com UFPAlign!</h3>
341
  <p><strong>Arquivo:</strong> ${result.filename}</p>
342
+ <p><strong>Duração Total:</strong> ${result.duration.toFixed(2)}s</p>
343
+ <p><strong>Camadas encontradas:</strong> ${result.tiers.length}</p>
344
  <a href="/download/${result.filename.replace('.TextGrid', '')}" target="_blank">
345
  📥 Download TextGrid
346
  </a>
347
+
348
+ ${syllableHtml}
349
+ ${wordHtml}
350
+ ${phonemeHtml}
351
+
352
+ <details style="margin-top: 20px;">
353
+ <summary>📊 Dados Completos de Alinhamento</summary>
354
+ <pre style="background: #f8f9fa; padding: 10px; overflow-x: auto; max-height: 400px;">${JSON.stringify(result, null, 2)}</pre>
355
  </details>
356
  </div>
357
  `;
 
383
  text: str = Form(...)
384
  ):
385
  """
386
+ Perform forced alignment on uploaded audio and text using UFPAlign.
387
  """
388
  try:
389
  # Validate file type
 
403
  logger.info(f"📁 Arquivo salvo: {audio_path}")
404
  logger.info(f"📝 Texto: {text[:100]}...")
405
 
406
+ # Run UFPAlign alignment
407
+ success, textgrid_path, error_msg = run_ufpalign_alignment(
408
  str(audio_path),
409
  text,
410
  str(OUTPUT_DIR)
 
413
  if not success:
414
  raise HTTPException(status_code=500, detail=error_msg)
415
 
416
+ # Parse TextGrid and return results with syllable information
417
+ result = parse_textgrid_to_syllable_info(textgrid_path)
418
 
419
  # Cleanup uploaded file
420
  try:
 
454
  @app.get("/health")
455
  async def health_check():
456
  """Health check endpoint."""
457
+ return {"status": "healthy", "aligner": "UFPAlign", "language": "Brazilian Portuguese"}
458
 
459
  @app.get("/models")
460
+ async def list_ufpalign_info():
461
+ """List UFPAlign information."""
462
  try:
463
+ # Check if UFPAlign is available
464
+ result = subprocess.run(
465
+ ["ls", "/opt/UFPAlign/"],
 
 
 
 
 
 
 
466
  capture_output=True,
467
  text=True
468
  )
469
 
470
  return {
471
+ "aligner": "UFPAlign",
472
+ "version": "Kaldi-based",
473
+ "language": "Brazilian Portuguese",
474
+ "features": ["Syllable segmentation", "Phoneme alignment", "Word boundaries"],
475
+ "available": result.returncode == 0,
476
+ "ufpalign_files": result.stdout.split('\n') if result.returncode == 0 else []
477
  }
478
  except Exception as e:
479
  return {"error": str(e)}