Spaces:

Rajor78
/

Transcript2Word

Sleeping

File size: 1,695 Bytes

1b4137e
ed87615
56e1e3f
 
 
 
69b32e0
 
 
56e1e3f
 
 
69b32e0
1b4137e
56e1e3f
 
 
 
 
 
 
 
69b32e0
1b4137e
 
 
 
69b32e0
1b4137e
56e1e3f
 
 
 
 
 
 
 
 
 
 
1b4137e
56e1e3f
69b32e0
1b4137e
 
 
56e1e3f
 
 
 
1b4137e
69b32e0
1b4137e

import gradio as gr
import subprocess
import os
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import language_tool_python
from pydub import AudioSegment
from docx import Document

def extract_audio(video_path, audio_path):
    """Extract the audio track of a video into a 16 kHz mono 16-bit PCM WAV file.

    Args:
        video_path: Path of the input media file handed to ffmpeg.
        audio_path: Destination path of the WAV file; overwritten if it exists.

    Returns:
        ``audio_path``, unchanged, for convenient chaining.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable is not on ``PATH``.
    """
    # Pass an argument list instead of a shell string: paths come from user
    # uploads, and quoting them by hand breaks (or allows command injection)
    # as soon as a filename contains a quote character.
    command = [
        "ffmpeg",
        "-y",  # overwrite the output file without prompting
        "-i", os.fspath(video_path),
        "-ar", "16000",
        "-ac", "1",
        "-c:a", "pcm_s16le",
        os.fspath(audio_path),
    ]
    subprocess.run(command, check=True)
    return audio_path

def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the ``openai/whisper-base`` model.

    The audio is decoded to a mono 16 kHz float waveform and fed to the model
    in consecutive 30-second windows; the per-window transcriptions are joined
    with single spaces.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcription as a string, or ``""`` when the file has no samples.
    """
    # Local import keeps this block self-contained; numpy is only used here.
    import numpy as np

    sample_rate = 16000
    # The Whisper feature extractor pads/truncates every input to 30 seconds,
    # so longer recordings must be split or everything after 30 s is dropped.
    window_size = 30 * sample_rate

    # The processor expects raw waveform samples, not a file path, so the
    # audio has to be decoded first.
    segment = (
        AudioSegment.from_file(audio_path)
        .set_frame_rate(sample_rate)
        .set_channels(1)
        .set_sample_width(2)
    )
    # 16-bit PCM integers -> floats in [-1.0, 1.0).
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
    if samples.size == 0:
        return ""

    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

    pieces = []
    for start in range(0, samples.size, window_size):
        chunk = samples[start:start + window_size]
        audio_input = processor(chunk, return_tensors="pt", sampling_rate=sample_rate)
        result = model.generate(**audio_input)
        text = processor.decode(result[0], skip_special_tokens=True).strip()
        if text:
            pieces.append(text)

    return " ".join(pieces)

def correct_text(text):
    """Apply LanguageTool's Spanish ('es') corrections to ``text``.

    Args:
        text: The text to correct.

    Returns:
        The corrected text. Empty or whitespace-only input is returned
        unchanged without starting LanguageTool.
    """
    if not text or not text.strip():
        # Nothing to check; skip starting the LanguageTool backend.
        return text

    tool = language_tool_python.LanguageTool('es')
    try:
        matches = tool.check(text)
        return language_tool_python.utils.correct(text, matches)
    finally:
        # Each LanguageTool instance owns a backend that must be shut down
        # explicitly; otherwise every request leaves one behind.
        tool.close()

def process_video(video_file):
    """Transcribe an uploaded video, correct the text and export it to Word.

    Args:
        video_file: The upload, either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.

    Returns:
        A ``(corrected_text, doc_path)`` tuple, where ``doc_path`` is the path
        of the generated ``transcription.docx``.

    Raises:
        ValueError: If no file was provided.
    """
    # Local import keeps this block self-contained.
    import tempfile

    if video_file is None:
        raise ValueError("No video file was provided.")

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    if isinstance(video_file, (str, os.PathLike)):
        video_path = os.fspath(video_file)
    else:
        video_path = video_file.name

    # The intermediate WAV lives in a scratch directory so it can never
    # collide with the upload itself (e.g. a ``.wav`` input) and is always
    # removed, even when extraction or transcription fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        audio_path = os.path.join(scratch_dir, "audio.wav")
        extract_audio(video_path, audio_path)
        transcribed_text = transcribe_audio(audio_path)

    corrected_text = correct_text(transcribed_text)

    doc = Document()
    doc.add_paragraph(corrected_text)
    # A fresh directory per request keeps concurrent requests from
    # overwriting each other's document while preserving the file name.
    doc_path = os.path.join(tempfile.mkdtemp(), "transcription.docx")
    doc.save(doc_path)

    return corrected_text, doc_path

# UI wiring: one file upload in; the corrected transcript and the generated
# Word document out.
_video_input = gr.File(label="Sube un archivo de video")
_result_outputs = [
    gr.Textbox(label="Texto transcrito y corregido"),
    gr.File(label="Descargar transcripción Word"),
]

demo = gr.Interface(fn=process_video, inputs=_video_input, outputs=_result_outputs)

# Start the Gradio server when the script runs.
demo.launch()