# Source: Hugging Face Space by GannaEslam38 — app.py (commit dd0312e, verified).
# (The three lines above were scraped web-page residue, not Python code.)
import torch
from transformers import pipeline
import gradio as gr
import os
import sys
from google import genai
# NOTE(review): requirements.txt reportedly lists PyMuPDF, but it is never
# imported or used in this file — confirm whether it is still needed.
# ==============================================================================
# I. ASR Setup (Whisper)
# ==============================================================================
# 1. Define the model identifier
MODEL_ID = "MohamedRashad/Arabic-Whisper-CodeSwitching-Edition"

# Set the device (GPU or CPU). The transformers pipeline accepts either a
# CUDA device index (0) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")

# 2. Load the ASR Pipeline. chunk_length_s lets Whisper transcribe audio
# longer than its native 30-second window by processing it in chunks.
try:
    print("Loading ASR pipeline (Whisper) with chunking parameters...")
    pipeline_kwargs = {
        "chunk_length_s": 30,  # Max chunk length in seconds
    }
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=MODEL_ID,
        device=device,
        **pipeline_kwargs
    )
    print("Pipeline loaded successfully.")
except Exception as e:
    # Keep the module importable even if the model download/load fails;
    # full_pipeline() checks for None and reports the failure to the user.
    print(f"Error loading ASR pipeline: {e}.")
    asr_pipeline = None
# ==============================================================================
# II. Summarization Setup (Gemini)
# ==============================================================================
# Get API key from Hugging Face Space Secrets
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    # Fail fast at import time: nothing downstream works without the key.
    raise ValueError("GEMINI_API_KEY is not set in Hugging Face Space Secrets.")

try:
    client = genai.Client(api_key=API_KEY)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Gemini Client: {e}")

# Model and chunking configuration.
MODEL_NAME = "gemini-2.5-flash"
MAX_TOKENS_PER_CHUNK = 10000
# Per-chunk character budget: assumes ~5 characters per token — TODO confirm
# for Arabic text — with a 10% safety margin (hence the 0.9 factor).
CHUNK_SIZE_LIMIT = int(MAX_TOKENS_PER_CHUNK * 5 * 0.9)
# --- 2.1 Text Splitting Function (Chunking) ---
def split_text_into_chunks(text: str) -> list[str]:
"""Splits large text into smaller chunks for processing."""
chunks = []
current_chunk = ""
sentences = text.split('.')
for sentence in sentences:
if len(current_chunk) + len(sentence) < CHUNK_SIZE_LIMIT:
current_chunk += sentence + ". "
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + ". "
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
# --- 2.2 Text Correction Function ---
def correct_and_format_text(raw_text: str) -> str:
    """Clean up raw ASR output via Gemini.

    Fixes spelling/grammar and inserts punctuation while instructing the
    model to preserve the original content and meaning unchanged.

    Args:
        raw_text: Unpunctuated text as produced by the Whisper pipeline.

    Returns:
        The corrected, punctuated text from the model.
    """
    correction_prompt = f"""
You are an expert text editor. Your task is to take raw, unpunctuated text, often from a Speech-to-Text (ASR) system, and correct it.
Perform the following actions:
1. **Fix Spelling and Grammar:** Correct all spelling, syntax, and grammatical errors.
2. **Add Punctuation:** Insert all necessary punctuation (periods, commas, question marks, etc.) to make the text readable and clear.
3. **Preserve Content:** DO NOT add, delete, or change any core meaning or factual information. Only correct the form.
Raw Text to Correct:
---
{raw_text}
"""
    result = client.models.generate_content(model=MODEL_NAME, contents=correction_prompt)
    return result.text
# --- 2.3 Smart Summarization and Merging Core Logic ---
def smart_summarize_and_merge(text_to_summarize: str) -> str:
    """Summarize text of arbitrary length with a map-reduce strategy.

    Long inputs are split into chunks, each chunk is summarized on its own
    (map), and the partial summaries are merged into one final bullet-point
    summary (reduce). Short inputs go through a single review/format pass.

    Args:
        text_to_summarize: The (already corrected) transcript text.

    Returns:
        The final bullet-point summary from the model.
    """
    # Chunk only when the input exceeds the per-request character budget.
    if len(text_to_summarize) > CHUNK_SIZE_LIMIT:
        chunks = split_text_into_chunks(text_to_summarize)
    else:
        chunks = [text_to_summarize]

    # Map step: produce one bullet-point summary per chunk.
    partial_summaries = []
    for chunk in chunks:
        partial_prompt = f"""
You are an expert summarizer. Summarize the following text into **clear, key bullet points**.
Do not leave out any essential information. The summary must be in the same language as the source text.
Source Text:
---
{chunk}
"""
        reply = client.models.generate_content(model=MODEL_NAME, contents=partial_prompt)
        partial_summaries.append(reply.text)

    # Reduce step: merge the partial summaries, or just reformat a single one.
    if len(partial_summaries) > 1:
        input_for_final_prompt = "\n\n--- Previous Chunk Summary ---\n\n".join(partial_summaries)
        prompt_type = "summarize the provided partial summaries"
    else:
        input_for_final_prompt = partial_summaries[0]
        prompt_type = "review and format the following summary"

    final_prompt = f"""
You are a professional text summarizer. {prompt_type} into clear, comprehensive **Bullet Points**.
Use **round bullet points (•)** for the list items.
You must integrate all key points from all sections.
Language Instructions:
1. **If the majority of the input text was in English:** The final summary must be **strictly in English**.
2. **If the majority of the input text was in Arabic (including dialects):** The final summary must be **in Formal Arabic**, while **strictly preserving all foreign technical terms (English) exactly as they are** without translation.
Input:
---
{input_for_final_prompt}
"""
    reply = client.models.generate_content(model=MODEL_NAME, contents=final_prompt)
    return reply.text
# ==============================================================================
# III. Full Pipeline Function (Gradio FN)
# ==============================================================================
def full_pipeline(audio_path):
    """
    Manages the full pipeline: ASR -> Correction -> Summarization.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio, or None.

    Returns:
        A (corrected_text, summary) tuple. On failure the first element
        carries the error message and the second a placeholder string.
    """
    if asr_pipeline is None:
        return ("Error loading the Transcription model.", "No summary generated.")
    if audio_path is None:
        return ("Please upload an audio file first.", "No summary generated.")

    # 1. ASR Step: Convert audio to raw text
    print(f"Step 1: Starting ASR for {audio_path}")
    try:
        asr_result = asr_pipeline(audio_path, return_timestamps=True)
        # BUG FIX: the original stored the sentinel "Failed to extract text."
        # here but later compared against "Failed to extract meaningful text.",
        # so the failure branch could never fire. Use an empty-string fallback
        # and a simple truthiness check instead.
        raw_asr_text = asr_result.get('text', '').strip()
    except Exception as e:
        error_msg = f"An error occurred during the ASR process (Whisper): {e}"
        return (error_msg, "No summary generated.")
    if not raw_asr_text:
        return ("Transcription failed to extract meaningful text.", "No summary generated.")

    # 2. Correction Step + 3. Summarization Step. Guarded so a Gemini API
    # failure (quota, network) reports an error instead of crashing the
    # Gradio handler.
    print(f"Step 2: Starting text correction for {len(raw_asr_text)} characters.")
    try:
        corrected_text = correct_and_format_text(raw_asr_text)
        print("Step 3: Starting smart summarization.")
        final_summary = smart_summarize_and_merge(corrected_text)
    except Exception as e:
        return (f"An error occurred during the summarization process (Gemini): {e}",
                "No summary generated.")

    # Cleanup Markdown: strip bold markers for plain-text display.
    clean_summary = final_summary.replace('**', '')
    print("Pipeline finished successfully.")
    # Return only Corrected Text and Summary (two outputs)
    return corrected_text, clean_summary
# ==============================================================================
# IV. Gradio UI Definition
# ==============================================================================
title = "🎙️ Audio Transcription, Correction, and Smart Summarization Tool"
description = (
    "Upload an audio file to automatically convert speech to text, correct spelling and punctuation errors, "
    "and receive a comprehensive summary in bullet points (Formal Arabic, preserving English terms)."
)

# Build the components and interface first, then launch — identical in
# behavior to the chained gr.Interface(...).launch() form.
audio_input = gr.Audio(
    type="filepath",
    label="Upload an audio file (WAV, MP3, etc.) or record directly"
)
output_boxes = [
    # Output 1: Corrected Text
    gr.Textbox(label="Corrected and Formatted Text", lines=8),
    # Output 2: Final Summary
    gr.Textbox(label="Final Comprehensive Summary", lines=8),
]

demo = gr.Interface(
    fn=full_pipeline,
    inputs=audio_input,
    outputs=output_boxes,
    title=title,
    description=description,
    live=False
)
demo.launch()