"""Audio transcription, correction, and smart-summarization Space.

Pipeline: Whisper ASR -> Gemini text correction -> Gemini chunked
("map/reduce") summarization, served through a Gradio interface.
"""

import os
import sys

import gradio as gr
import torch
from google import genai
from transformers import pipeline

# Note: PyMuPDF is implicitly required in requirements.txt for handling large text chunks.

# ==============================================================================
# I. ASR Setup (Whisper)
# ==============================================================================

# 1. Define the model identifier
MODEL_ID = "MohamedRashad/Arabic-Whisper-CodeSwitching-Edition"

# Set the device (GPU or CPU). transformers accepts an int CUDA index or "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")

# 2. Load the ASR Pipeline. On failure we keep the app alive and let
#    full_pipeline() report a friendly error instead of crashing the Space.
try:
    print("Loading ASR pipeline (Whisper) with chunking parameters...")
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=MODEL_ID,
        device=device,
        chunk_length_s=30,  # Max chunk length in seconds
    )
    print("Pipeline loaded successfully.")
except Exception as e:
    print(f"Error loading ASR pipeline: {e}.")
    asr_pipeline = None

# ==============================================================================
# II. Summarization Setup (Gemini)
# ==============================================================================

# Get API key from Hugging Face Space Secrets
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise ValueError("GEMINI_API_KEY is not set in Hugging Face Space Secrets.")

try:
    client = genai.Client(api_key=API_KEY)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Gemini Client: {e}")

MODEL_NAME = "gemini-2.5-flash"
MAX_TOKENS_PER_CHUNK = 10000
# Rough character budget: ~5 chars per token, with a 10% safety margin.
CHUNK_SIZE_LIMIT = int(MAX_TOKENS_PER_CHUNK * 5 * 0.9)


# --- 2.1 Text Splitting Function (Chunking) ---
def split_text_into_chunks(text: str) -> list[str]:
    """Split large text into sentence-aligned chunks below CHUNK_SIZE_LIMIT.

    Sentences are delimited naively on '.'; each chunk is re-terminated with
    ". " so downstream prompts receive punctuated text.

    Args:
        text: The (possibly very large) input text.

    Returns:
        A list of non-empty chunk strings; empty list for empty input.
    """
    chunks: list[str] = []
    current_chunk = ""
    for sentence in text.split('.'):
        # str.split('.') yields a trailing empty element when the text ends
        # with a period; skip it so we don't emit a spurious ". " chunk.
        if not sentence.strip():
            continue
        if len(current_chunk) + len(sentence) < CHUNK_SIZE_LIMIT:
            current_chunk += sentence + ". "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


# --- 2.2 Text Correction Function ---
def correct_and_format_text(raw_text: str) -> str:
    """Correct spelling, grammar, and punctuation of raw ASR output via Gemini.

    Args:
        raw_text: Unpunctuated text straight from the ASR step.

    Returns:
        The corrected, punctuated text produced by the model.
    """
    correction_prompt = f"""
You are an expert text editor. Your task is to take raw, unpunctuated text,
often from a Speech-to-Text (ASR) system, and correct it.

Perform the following actions:
1. **Fix Spelling and Grammar:** Correct all spelling, syntax, and grammatical errors.
2. **Add Punctuation:** Insert all necessary punctuation (periods, commas, question marks, etc.) to make the text readable and clear.
3. **Preserve Content:** DO NOT add, delete, or change any core meaning or factual information. Only correct the form.

Raw Text to Correct:
---
{raw_text}
"""
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=correction_prompt
    )
    return response.text


# --- 2.3 Smart Summarization and Merging Core Logic ---
def smart_summarize_and_merge(text_to_summarize: str) -> str:
    """Run the full summarization pipeline: chunk, summarize, merge.

    Uses a map/reduce scheme: each chunk is summarized independently ("map"),
    then the partial summaries are merged into one final bullet-point summary
    ("reduce"). Short inputs skip the chunking.

    Args:
        text_to_summarize: Corrected transcript text.

    Returns:
        The final bullet-point summary text.
    """
    if len(text_to_summarize) > CHUNK_SIZE_LIMIT:
        chunks = split_text_into_chunks(text_to_summarize)
    else:
        chunks = [text_to_summarize]

    partial_summaries = []

    # Map Step: Summarize each chunk
    for chunk in chunks:
        partial_prompt = f"""
You are an expert summarizer. Summarize the following text into **clear, key bullet points**.
Do not leave out any essential information. The summary must be in the same language as the source text.

Source Text:
---
{chunk}
"""
        response = client.models.generate_content(model=MODEL_NAME, contents=partial_prompt)
        partial_summaries.append(response.text)

    # Reduce Step: Final Summarization/Language Formatting
    if len(partial_summaries) > 1:
        combined_summaries = "\n\n--- Previous Chunk Summary ---\n\n".join(partial_summaries)
        input_for_final_prompt = combined_summaries
        prompt_type = "summarize the provided partial summaries"
    else:
        input_for_final_prompt = partial_summaries[0]
        prompt_type = "review and format the following summary"

    final_prompt = f"""
You are a professional text summarizer. {prompt_type} into clear, comprehensive **Bullet Points**.
Use **round bullet points (•)** for the list items. You must integrate all key points from all sections.

Language Instructions:
1. **If the majority of the input text was in English:** The final summary must be **strictly in English**.
2. **If the majority of the input text was in Arabic (including dialects):** The final summary must be **in Formal Arabic**, while **strictly preserving all foreign technical terms (English) exactly as they are** without translation.

Input:
---
{input_for_final_prompt}
"""
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=final_prompt
    )
    return response.text


# ==============================================================================
# III. Full Pipeline Function (Gradio FN)
# ==============================================================================
def full_pipeline(audio_path):
    """Run the full pipeline: ASR -> Correction -> Summarization.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio, or None.

    Returns:
        A 2-tuple of (corrected text, final summary); on failure both slots
        carry human-readable error messages instead.
    """
    if asr_pipeline is None:
        return ("Error loading the Transcription model.", "No summary generated.")
    if audio_path is None:
        return ("Please upload an audio file first.", "No summary generated.")

    # 1. ASR Step: Convert audio to raw text
    print(f"Step 1: Starting ASR for {audio_path}")
    try:
        asr_result = asr_pipeline(audio_path, return_timestamps=True)
        # BUG FIX: the original assigned the sentinel "Failed to extract text."
        # but compared against "Failed to extract meaningful text.", so the
        # failure branch below could never trigger. An emptiness check on the
        # actual transcript covers both cases correctly.
        raw_asr_text = asr_result.get("text", "").strip()
    except Exception as e:
        error_msg = f"An error occurred during the ASR process (Whisper): {e}"
        return (error_msg, "No summary generated.")

    if not raw_asr_text:
        return ("Transcription failed to extract meaningful text.", "No summary generated.")

    # 2. Correction Step: Correct the raw text
    print(f"Step 2: Starting text correction for {len(raw_asr_text)} characters.")
    corrected_text = correct_and_format_text(raw_asr_text)

    # 3. Summarization Step: Summarize
    print("Step 3: Starting smart summarization.")
    final_summary = smart_summarize_and_merge(corrected_text)

    # Cleanup Markdown bold markers so the plain Textbox reads cleanly.
    clean_summary = final_summary.replace('**', '')

    print("Pipeline finished successfully.")
    # Return only Corrected Text and Summary (two outputs)
    return corrected_text, clean_summary


# ==============================================================================
# IV. Gradio UI Definition
# ==============================================================================
title = "🎙️ Audio Transcription, Correction, and Smart Summarization Tool"
description = (
    "Upload an audio file to automatically convert speech to text, correct spelling and punctuation errors, "
    "and receive a comprehensive summary in bullet points (Formal Arabic, preserving English terms)."
)

gr.Interface(
    fn=full_pipeline,
    inputs=gr.Audio(
        type="filepath",
        label="Upload an audio file (WAV, MP3, etc.) or record directly"
    ),
    outputs=[
        # Output 1: Corrected Text
        gr.Textbox(label="Corrected and Formatted Text", lines=8),
        # Output 2: Final Summary
        gr.Textbox(label="Final Comprehensive Summary", lines=8)
    ],
    title=title,
    description=description,
    live=False
).launch()