Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Files Community

colab-user committed on Jan 13

Commit

7658895

1 Parent(s): 27f5fbd

Fix format download & transcribe segments

Browse files

Files changed (3) hide show

app/api/routes.py +24 -10
app/schemas/models.py +1 -1
app/services/processor.py +4 -4

app/api/routes.py CHANGED Viewed

@@ -4,6 +4,7 @@ API routes for the transcription service.
 import logging
 import time
 from pathlib import Path
 from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
 from fastapi.responses import FileResponse
@@ -44,7 +45,7 @@ async def get_models():
 async def transcribe_audio(
     background_tasks: BackgroundTasks,
     file: UploadFile = File(..., description="Audio file to transcribe"),
-    model: str = Form(default="EraX-WoW-Turbo", description="Whisper model to use"),
     language: str = Form(default="vi", description="Language code"),
     merge_segments: bool = Form(default=True, description="Merge consecutive speaker segments"),
 ):
@@ -81,14 +82,27 @@ async def transcribe_audio(
         )
         # Save output files
-        txt_filename = f"{upload_path.stem}_transcript.txt"
-        srt_filename = f"{upload_path.stem}_transcript.srt"
         txt_path = settings.processed_dir / txt_filename
-        srt_path = settings.processed_dir / srt_filename
         txt_path.write_text(result.txt_content, encoding="utf-8")
-        srt_path.write_text(result.srt_content, encoding="utf-8")
         # Schedule cleanup
         background_tasks.add_task(cleanup_files, upload_path)
@@ -109,7 +123,7 @@ async def transcribe_audio(
             duration=result.duration,
             processing_time=result.processing_time,
             download_txt=f"/api/download/{txt_filename}",
-            download_srt=f"/api/download/{srt_filename}",
         )
     except HTTPException:
@@ -128,7 +142,7 @@ async def download_file(filename: str):
     Supports: .txt, .srt files
     """
     # Security: only allow specific extensions and no path traversal
-    if not filename.endswith(('.txt', '.srt')) or '/' in filename or '..' in filename:
         raise HTTPException(status_code=400, detail="Invalid filename")
     filepath = settings.processed_dir / filename
@@ -137,7 +151,7 @@ async def download_file(filename: str):
         raise HTTPException(status_code=404, detail="File not found")
     # Determine media type
-    media_type = "text/plain" if filename.endswith('.txt') else "application/x-subrip"
     return FileResponse(
         path=filepath,

 import logging
 import time
 from pathlib import Path
+import csv
 from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
 from fastapi.responses import FileResponse
 async def transcribe_audio(
     background_tasks: BackgroundTasks,
     file: UploadFile = File(..., description="Audio file to transcribe"),
+    model: str = Form(default="PhoWhisper Large", description="Whisper model to use"),
     language: str = Form(default="vi", description="Language code"),
     merge_segments: bool = Form(default=True, description="Merge consecutive speaker segments"),
 ):
         )
         # Save output files
+        txt_filename = f"{upload_path.stem}_transcript.txt"
         txt_path = settings.processed_dir / txt_filename
         txt_path.write_text(result.txt_content, encoding="utf-8")
+        csv_filename = f"{upload_path.stem}_transcript.csv"
+        csv_path = settings.processed_dir / csv_filename
+        with csv_path.open("w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(
+                f,
+                fieldnames=["start", "end", "speaker", "text"]
+            )
+            writer.writeheader()
+            for seg in result.segments:
+                writer.writerow({
+                    "start": round(seg.start, 2),
+                    "end": round(seg.end, 2),
+                    "speaker": seg.speaker,
+                    "text": seg.text
+                })
         # Schedule cleanup
         background_tasks.add_task(cleanup_files, upload_path)
             duration=result.duration,
             processing_time=result.processing_time,
             download_txt=f"/api/download/{txt_filename}",
+            download_csv=f"/api/download/{csv_filename}",
         )
     except HTTPException:
     Supports: .txt, .csv files
     """
     # Security: only allow specific extensions and no path traversal
+    if not filename.endswith(('.txt', '.csv')) or '/' in filename or '..' in filename:
         raise HTTPException(status_code=400, detail="Invalid filename")
     filepath = settings.processed_dir / filename
         raise HTTPException(status_code=404, detail="File not found")
     # Determine media type
+    media_type = "text/plain" if filename.endswith('.txt') else "text/csv"
     return FileResponse(
         path=filepath,

app/schemas/models.py CHANGED Viewed

@@ -56,7 +56,7 @@ class TranscriptionResponse(BaseModel):
     speaker_count: int = Field(default=0, description="Number of detected speakers")
     processing_time: float = Field(default=0.0, description="Processing time in seconds")
     download_txt: Optional[str] = Field(default=None, description="Download URL for TXT file")
-    download_srt: Optional[str] = Field(default=None, description="Download URL for SRT file")
 class ErrorResponse(BaseModel):

     speaker_count: int = Field(default=0, description="Number of detected speakers")
     processing_time: float = Field(default=0.0, description="Processing time in seconds")
     download_txt: Optional[str] = Field(default=None, description="Download URL for TXT file")
+    download_csv: Optional[str] = Field(default=None, description="Download URL for CSV file")
 class ErrorResponse(BaseModel):

app/services/processor.py CHANGED Viewed

@@ -39,7 +39,7 @@ class ProcessingResult:
     # Output files
     txt_content: str = ""
-    srt_content: str = ""
 def convert_audio_to_wav(audio_path: Path) -> Path:
@@ -150,7 +150,7 @@ class Processor:
     async def process_audio(
         cls,
         audio_path: Path,
-        model_name: str = "EraX-WoW-Turbo",
         language: str = "vi",
         merge_segments: bool = True,
         # VAD options
@@ -275,7 +275,7 @@ class Processor:
         # Step 6: Generate outputs
         txt_content = cls._generate_txt(processed_segments, unique_speakers, processing_time, duration)
-        srt_content = cls._generate_srt(processed_segments)
         # Cleanup WAV if different from original
         if wav_path != audio_path and wav_path.exists():
@@ -290,7 +290,7 @@ class Processor:
             duration=duration,
             processing_time=processing_time,
             txt_content=txt_content,
-            srt_content=srt_content
         )
     @classmethod

     # Output files
     txt_content: str = ""
+    csv_content: str = ""
 def convert_audio_to_wav(audio_path: Path) -> Path:
     async def process_audio(
         cls,
         audio_path: Path,
+        model_name: str = "PhoWhisper Large",
         language: str = "vi",
         merge_segments: bool = True,
         # VAD options
         # Step 6: Generate outputs
         txt_content = cls._generate_txt(processed_segments, unique_speakers, processing_time, duration)
+        csv_content = cls._generate_csv(processed_segments)
         # Cleanup WAV if different from original
         if wav_path != audio_path and wav_path.exists():
             duration=duration,
             processing_time=processing_time,
             txt_content=txt_content,
+            csv_content=csv_content
         )
     @classmethod