Spaces:

GannaEslam38
/

whisper_code-switching

Running

File size: 8,284 Bytes

import torch
from transformers import pipeline
import gradio as gr
import os
import sys
from google import genai
# Note: PyMuPDF is reportedly listed in requirements.txt, but nothing in this file imports or uses it; the text chunking below is plain string handling.

# ==============================================================================
# I. ASR Setup (Whisper)
# ==============================================================================

# 1. Hugging Face Hub identifier of the Whisper checkpoint to load
MODEL_ID = "MohamedRashad/Arabic-Whisper-CodeSwitching-Edition"

# Use CUDA device 0 when a GPU is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"
print(f"Device set to use: {device}")

# 2. Build the ASR pipeline. `asr_pipeline` stays None when loading fails so
#    that code using it can detect the failure instead of crashing at import.
asr_pipeline = None
try:
    print("Loading ASR pipeline (Whisper) with chunking parameters...")

    # Audio is processed in windows of at most 30 seconds
    pipeline_kwargs = dict(chunk_length_s=30)

    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=MODEL_ID,
        device=device,
        **pipeline_kwargs,
    )
except Exception as e:
    print(f"Error loading ASR pipeline: {e}.")
    asr_pipeline = None
else:
    print("Pipeline loaded successfully.")


# ==============================================================================
# II. Summarization Setup (Gemini)
# ==============================================================================

# Read the Gemini API key from the environment (configured as a Hugging Face
# Space secret). Fail fast at import time when it is missing.
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise ValueError("GEMINI_API_KEY is not set in Hugging Face Space Secrets.")

try:
    client = genai.Client(api_key=API_KEY)
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError(f"Failed to initialize Gemini Client: {e}") from e

# Gemini model used for every correction and summarization request
MODEL_NAME = "gemini-2.5-flash"

# Per-chunk budget, expressed in tokens
MAX_TOKENS_PER_CHUNK = 10000

# Per-chunk budget converted to characters (45,000).
# NOTE(review): presumably assumes ~5 characters per token with a 10% safety
# margin; confirm against the tokenizer for Arabic/English mixed text.
CHUNK_SIZE_LIMIT = int(MAX_TOKENS_PER_CHUNK * 5 * 0.9)


# --- 2.1 Text Splitting Function (Chunking) ---
def split_text_into_chunks(text: str, max_chars: "int | None" = None) -> list[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    The text is cut at sentence boundaries (whitespace that follows ``.``,
    ``!``, ``?`` or the Arabic question mark), so punctuation is kept as
    written and decimals such as ``3.14`` are not broken apart. Sentences are
    packed greedily into chunks, joined by a single space. A single sentence
    longer than the limit is hard-split by character count so that no chunk
    ever exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters. When ``None`` the
            module-level ``CHUNK_SIZE_LIMIT`` is used.

    Returns:
        The list of chunks in original order; empty when ``text`` is empty or
        whitespace only.

    Raises:
        ValueError: If the effective limit is not a positive integer.
    """
    import re  # local import keeps this block self-contained

    limit = CHUNK_SIZE_LIMIT if max_chars is None else max_chars
    if limit <= 0:
        raise ValueError("max_chars must be a positive integer.")

    # Lookbehind keeps the terminating punctuation attached to its sentence.
    sentences = [s for s in re.split(r"(?<=[.!?؟])\s+", text.strip()) if s]

    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for sentence in sentences:
        # An oversized sentence is cut into limit-sized pieces; a normal
        # sentence yields exactly one piece (itself).
        for start in range(0, len(sentence), limit):
            piece = sentence[start:start + limit]
            # +1 accounts for the joining space when the chunk is non-empty.
            needed = len(piece) + (1 if current else 0)
            if current and current_len + needed > limit:
                chunks.append(" ".join(current))
                current = []
                current_len = 0
                needed = len(piece)
            current.append(piece)
            current_len += needed

    if current:
        chunks.append(" ".join(current))

    return chunks

# --- 2.2 Text Correction Function ---
def correct_and_format_text(raw_text: str) -> str:
    """Ask Gemini to fix spelling, grammar and punctuation of raw ASR text.

    Args:
        raw_text: Unpunctuated text, typically the output of the ASR step.

    Returns:
        The corrected text. The input is returned unchanged (without calling
        the API) when it is empty or whitespace only, and also when the model
        returns no text, so callers always receive a string.

    Raises:
        Whatever ``client.models.generate_content`` raises on API failure.
    """
    # Nothing to correct: skip the API round-trip entirely.
    if not raw_text or not raw_text.strip():
        return raw_text or ""

    correction_prompt = f"""
    You are an expert text editor. Your task is to take raw, unpunctuated text, often from a Speech-to-Text (ASR) system, and correct it.
    
    Perform the following actions:
    1. **Fix Spelling and Grammar:** Correct all spelling, syntax, and grammatical errors.
    2. **Add Punctuation:** Insert all necessary punctuation (periods, commas, question marks, etc.) to make the text readable and clear.
    3. **Preserve Content:** DO NOT add, delete, or change any core meaning or factual information. Only correct the form.
    
    Raw Text to Correct:
    ---
    {raw_text}
    """

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=correction_prompt
    )

    # `response.text` can be None when the response carries no text parts;
    # fall back to the uncorrected text rather than returning None.
    corrected = response.text
    if not corrected or not corrected.strip():
        return raw_text
    return corrected

# --- 2.3 Smart Summarization and Merging Core Logic ---
def smart_summarize_and_merge(text_to_summarize: str) -> str:
    """Summarize text with a map/reduce pass over Gemini.

    Text longer than ``CHUNK_SIZE_LIMIT`` characters is split with
    ``split_text_into_chunks``; each chunk is summarized on its own (map
    step), then the partial summaries are merged and formatted by one final
    request (reduce step). Shorter text is summarized as a single chunk and
    still goes through the final formatting request.

    Args:
        text_to_summarize: The (already corrected) text to summarize.

    Returns:
        The final bullet-point summary. ``"No summary generated."`` is
        returned when the input is empty or the model produced no partial
        summary; the combined partial summaries are returned when the final
        request yields no text.

    Raises:
        Whatever ``client.models.generate_content`` raises on API failure.
    """
    # Empty input: nothing to send to the model.
    if not text_to_summarize or not text_to_summarize.strip():
        return "No summary generated."

    if len(text_to_summarize) > CHUNK_SIZE_LIMIT:
        chunks = split_text_into_chunks(text_to_summarize)
    else:
        chunks = [text_to_summarize]

    partial_summaries = []

    # Map Step: Summarize each chunk
    for chunk in chunks:
        partial_prompt = f"""
        You are an expert summarizer. Summarize the following text into **clear, key bullet points**.
        Do not leave out any essential information. The summary must be in the same language as the source text.
        
        Source Text:
        ---
        {chunk}
        """

        response = client.models.generate_content(model=MODEL_NAME, contents=partial_prompt)
        # `response.text` can be None (no text parts); skip such results so
        # the join below never sees a non-string.
        if response.text:
            partial_summaries.append(response.text)

    if not partial_summaries:
        return "No summary generated."

    # Reduce Step: Final Summarization/Language Formatting
    if len(partial_summaries) > 1:
        input_for_final_prompt = "\n\n--- Previous Chunk Summary ---\n\n".join(partial_summaries)
        prompt_type = "summarize the provided partial summaries"
    else:
        input_for_final_prompt = partial_summaries[0]
        prompt_type = "review and format the following summary"

    final_prompt = f"""
    You are a professional text summarizer. {prompt_type} into clear, comprehensive **Bullet Points**.
    Use **round bullet points (•)** for the list items.
    You must integrate all key points from all sections.
    
    Language Instructions:
    1. **If the majority of the input text was in English:** The final summary must be **strictly in English**.
    2. **If the majority of the input text was in Arabic (including dialects):** The final summary must be **in Formal Arabic**, while **strictly preserving all foreign technical terms (English) exactly as they are** without translation.
    
    Input:
    ---
    {input_for_final_prompt}
    """

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=final_prompt
    )
    # Best effort: if the final pass returns nothing, the merged partial
    # summaries are still a usable result.
    return response.text or input_for_final_prompt


# ==============================================================================
# III. Full Pipeline Function (Gradio FN)
# ==============================================================================

def full_pipeline(audio_path):
    """Run the full pipeline: ASR -> Correction -> Summarization.

    Every failure is reported through the returned strings rather than by
    raising, so the UI always receives two displayable values.

    Args:
        audio_path: Path of the audio file to transcribe, or ``None`` when
            nothing was provided.

    Returns:
        A 2-tuple ``(corrected_text, summary)``. On failure the first element
        carries an error message and the second is
        ``"No summary generated."``; if only the summarization step fails,
        the corrected text is still returned alongside an error message.
    """
    no_summary = "No summary generated."

    if asr_pipeline is None:
        return ("Error loading the Transcription model.", no_summary)

    if audio_path is None:
        return ("Please upload an audio file first.", no_summary)

    # 1. ASR Step: Convert audio to raw text
    print(f"Step 1: Starting ASR for {audio_path}")
    try:
        asr_result = asr_pipeline(audio_path, return_timestamps=True)
    except Exception as e:
        error_msg = f"An error occurred during the ASR process (Whisper): {e}"
        return (error_msg, no_summary)

    # A missing or empty 'text' entry is treated as "nothing transcribed".
    # (Previously a placeholder string was substituted here that the check
    # below never matched, so the placeholder itself got summarized.)
    extracted = asr_result.get("text") if isinstance(asr_result, dict) else None
    raw_asr_text = (extracted or "").strip()

    if not raw_asr_text:
        return ("Transcription failed to extract meaningful text.", no_summary)

    # 2. Correction Step: Correct the raw text
    print(f"Step 2: Starting text correction for {len(raw_asr_text)} characters.")
    try:
        corrected_text = correct_and_format_text(raw_asr_text)
    except Exception as e:
        error_msg = f"An error occurred during the text correction process (Gemini): {e}"
        return (error_msg, no_summary)
    # Fall back to the raw transcription if the model returned nothing.
    corrected_text = corrected_text or raw_asr_text

    # 3. Summarization Step: Summarize
    print("Step 3: Starting smart summarization.")
    try:
        final_summary = smart_summarize_and_merge(corrected_text)
    except Exception as e:
        # Keep the corrected text: it is still useful without a summary.
        error_msg = f"An error occurred during the summarization process (Gemini): {e}"
        return (corrected_text, error_msg)

    # Cleanup Markdown bold markers; guard against a missing summary.
    clean_summary = (final_summary or no_summary).replace('**', '')

    print("Pipeline finished successfully.")

    # Return only Corrected Text and Summary (two outputs)
    return corrected_text, clean_summary


# ==============================================================================
# IV. Gradio UI Definition
# ==============================================================================

# Page heading and introductory text shown above the interface
title = "🎙️ Audio Transcription, Correction, and Smart Summarization Tool"
description = (
    "Upload an audio file to automatically convert speech to text, correct spelling and punctuation errors, "
    + "and receive a comprehensive summary in bullet points (Formal Arabic, preserving English terms)."
)

# Input: the audio is handed to the pipeline function as a file path
audio_input = gr.Audio(
    type="filepath",
    label="Upload an audio file (WAV, MP3, etc.) or record directly",
)

# Output 1: Corrected Text
corrected_output = gr.Textbox(label="Corrected and Formatted Text", lines=8)
# Output 2: Final Summary
summary_output = gr.Textbox(label="Final Comprehensive Summary", lines=8)

# Wire the pipeline function to its input/output components and start the app
demo = gr.Interface(
    fn=full_pipeline,
    inputs=audio_input,
    outputs=[corrected_output, summary_output],
    title=title,
    description=description,
    live=False,
)
demo.launch()