|
|
import torch |
|
|
from transformers import pipeline |
|
|
import gradio as gr |
|
|
import os |
|
|
import sys |
|
|
from google import genai |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- ASR (Whisper) setup -------------------------------------------------

# Whisper checkpoint fine-tuned for Arabic/English code-switching speech.
MODEL_ID = "MohamedRashad/Arabic-Whisper-CodeSwitching-Edition"

# transformers accepts either a CUDA device index (int) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
print(f"Device set to use: {device}")

try:
    print("Loading ASR pipeline (Whisper) with chunking parameters...")
    # chunk_length_s=30 lets the pipeline handle audio longer than
    # Whisper's native 30-second window by transcribing it in chunks.
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=MODEL_ID,
        device=device,
        chunk_length_s=30,
    )
    print("Pipeline loaded successfully.")
except Exception as e:
    # Keep the module importable even when the model cannot be loaded;
    # full_pipeline reports the failure to the user at request time.
    print(f"Error loading ASR pipeline: {e}.")
    asr_pipeline = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Gemini (LLM) client configuration ---
# The key is expected in the environment (e.g. a Hugging Face Space secret).
API_KEY = os.environ.get("GEMINI_API_KEY")

# Fail fast at startup rather than on the first request.
if not API_KEY:
    raise ValueError("GEMINI_API_KEY is not set in Hugging Face Space Secrets.")

try:
    client = genai.Client(api_key=API_KEY)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Gemini Client: {e}")

# Model used for both the correction and summarization passes.
MODEL_NAME = "gemini-2.5-flash"
# Per-chunk token budget for text sent to the LLM.
MAX_TOKENS_PER_CHUNK = 10000
# Character budget per chunk: assumes ~5 characters per token with a 10%
# safety margin — TODO confirm this heuristic fits the target languages.
CHUNK_SIZE_LIMIT = int(MAX_TOKENS_PER_CHUNK * 5 * 0.9)
|
|
|
|
|
|
|
|
|
|
|
def split_text_into_chunks(text: str) -> list[str]: |
|
|
"""Splits large text into smaller chunks for processing.""" |
|
|
chunks = [] |
|
|
current_chunk = "" |
|
|
sentences = text.split('.') |
|
|
|
|
|
for sentence in sentences: |
|
|
if len(current_chunk) + len(sentence) < CHUNK_SIZE_LIMIT: |
|
|
current_chunk += sentence + ". " |
|
|
else: |
|
|
if current_chunk: |
|
|
chunks.append(current_chunk.strip()) |
|
|
current_chunk = sentence + ". " |
|
|
|
|
|
if current_chunk: |
|
|
chunks.append(current_chunk.strip()) |
|
|
|
|
|
return chunks |
|
|
|
|
|
|
|
|
def correct_and_format_text(raw_text: str) -> str:
    """Run one Gemini pass that fixes spelling/grammar and adds punctuation.

    The prompt instructs the model to correct form only — it must not add,
    remove, or alter any factual content of the transcript.
    """
    prompt = f"""
You are an expert text editor. Your task is to take raw, unpunctuated text, often from a Speech-to-Text (ASR) system, and correct it.

Perform the following actions:
1. **Fix Spelling and Grammar:** Correct all spelling, syntax, and grammatical errors.
2. **Add Punctuation:** Insert all necessary punctuation (periods, commas, question marks, etc.) to make the text readable and clear.
3. **Preserve Content:** DO NOT add, delete, or change any core meaning or factual information. Only correct the form.

Raw Text to Correct:
---
{raw_text}
"""
    result = client.models.generate_content(model=MODEL_NAME, contents=prompt)
    return result.text
|
|
|
|
|
|
|
|
def smart_summarize_and_merge(text_to_summarize: str) -> str:
    """Summarize text of any length via chunking plus a final merge pass.

    Inputs that fit within CHUNK_SIZE_LIMIT get a single summarization
    pass; longer inputs are split with split_text_into_chunks, each chunk
    is summarized into bullet points, and one last Gemini call merges all
    partial summaries into the final result.
    """
    # Only split when the text exceeds the per-request character budget.
    chunks = (
        split_text_into_chunks(text_to_summarize)
        if len(text_to_summarize) > CHUNK_SIZE_LIMIT
        else [text_to_summarize]
    )

    partial_summaries = []
    for chunk in chunks:
        partial_prompt = f"""
You are an expert summarizer. Summarize the following text into **clear, key bullet points**.
Do not leave out any essential information. The summary must be in the same language as the source text.

Source Text:
---
{chunk}
"""
        partial = client.models.generate_content(model=MODEL_NAME, contents=partial_prompt)
        partial_summaries.append(partial.text)

    # Merge step: either combine several partial summaries, or just review
    # and re-format the single one. The instruction wording changes with it.
    if len(partial_summaries) > 1:
        input_for_final_prompt = "\n\n--- Previous Chunk Summary ---\n\n".join(partial_summaries)
        prompt_type = "summarize the provided partial summaries"
    else:
        input_for_final_prompt = partial_summaries[0]
        prompt_type = "review and format the following summary"

    final_prompt = f"""
You are a professional text summarizer. {prompt_type} into clear, comprehensive **Bullet Points**.
Use **round bullet points (•)** for the list items.
You must integrate all key points from all sections.

Language Instructions:
1. **If the majority of the input text was in English:** The final summary must be **strictly in English**.
2. **If the majority of the input text was in Arabic (including dialects):** The final summary must be **in Formal Arabic**, while **strictly preserving all foreign technical terms (English) exactly as they are** without translation.

Input:
---
{input_for_final_prompt}
"""
    final = client.models.generate_content(model=MODEL_NAME, contents=final_prompt)
    return final.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def full_pipeline(audio_path):
    """
    Manage the full pipeline: ASR -> Correction -> Summarization.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio
            (Gradio Audio with type="filepath"), or None when nothing
            was provided.

    Returns:
        A (corrected_text, summary) tuple of strings. On any failure the
        first element carries the error message and the second element is
        the placeholder "No summary generated.".
    """
    if asr_pipeline is None:
        return ("Error loading the Transcription model.", "No summary generated.")

    if audio_path is None:
        return ("Please upload an audio file first.", "No summary generated.")

    print(f"Step 1: Starting ASR for {audio_path}")
    try:
        # return_timestamps=True is needed by Whisper when chunking audio
        # longer than its native 30-second window.
        asr_result = asr_pipeline(audio_path, return_timestamps=True)
        # BUG FIX: the old code stored the sentinel "Failed to extract text."
        # but later compared against "Failed to extract meaningful text.",
        # so the failure branch could never trigger. Use the empty string as
        # the single failure signal instead.
        raw_asr_text = asr_result.get('text', '').strip()
    except Exception as e:
        error_msg = f"An error occurred during the ASR process (Whisper): {e}"
        return (error_msg, "No summary generated.")

    if not raw_asr_text:
        return ("Transcription failed to extract meaningful text.", "No summary generated.")

    # Step 2: LLM pass to fix spelling/grammar and add punctuation.
    print(f"Step 2: Starting text correction for {len(raw_asr_text)} characters.")
    corrected_text = correct_and_format_text(raw_asr_text)

    # Step 3: chunked summarization plus final merge.
    print("Step 3: Starting smart summarization.")
    final_summary = smart_summarize_and_merge(corrected_text)

    # Strip markdown bold markers so the plain Textbox output stays clean.
    clean_summary = final_summary.replace('**', '')

    print("Pipeline finished successfully.")

    return corrected_text, clean_summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Gradio UI -----------------------------------------------------------

title = "🎙️ Audio Transcription, Correction, and Smart Summarization Tool"
description = (
    "Upload an audio file to automatically convert speech to text, correct spelling and punctuation errors, "
    "and receive a comprehensive summary in bullet points (Formal Arabic, preserving English terms)."
)

# Input/output widgets; the ASR pipeline expects a file path, not raw audio.
audio_input = gr.Audio(
    type="filepath",
    label="Upload an audio file (WAV, MP3, etc.) or record directly",
)
text_outputs = [
    gr.Textbox(label="Corrected and Formatted Text", lines=8),
    gr.Textbox(label="Final Comprehensive Summary", lines=8),
]

demo = gr.Interface(
    fn=full_pipeline,
    inputs=audio_input,
    outputs=text_outputs,
    title=title,
    description=description,
    live=False,
)
demo.launch()