Spaces:

GannaEslam38
/

speech-text-summary-api

Sleeping

App Files Files Community

speech-text-summary-api / api.py

GannaEslam38

Update api.py

69219a2 verified about 1 month ago

raw

history blame contribute delete

10.1 kB

	import os
	import sys
	import shutil
	import torch
	from fastapi import FastAPI, UploadFile, File, HTTPException
	from fastapi.responses import JSONResponse
	from transformers import pipeline
	from google import genai
	from pydub import AudioSegment

	# ==============================================================================
	# I. ASR Setup (Whisper)
	# ==============================================================================
	# Hugging Face model used for speech recognition.
	MODEL_ID = "MohamedRashad/Arabic-Whisper-CodeSwitching-Edition"

	# Use the first CUDA device when one is available, otherwise run on CPU.
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = "cpu"
	print(f"Device set to use: {device}")

	# Extra keyword arguments forwarded to the pipeline constructor.
	pipeline_kwargs = dict(chunk_length_s=30)

	# Load the model once at import time. On any failure the module still
	# imports, and `asr_pipeline` is left as None so callers can detect it.
	try:
	    print("Loading ASR pipeline (Whisper)...")
	    asr_pipeline = pipeline(
	        "automatic-speech-recognition",
	        model=MODEL_ID,
	        device=device,
	        **pipeline_kwargs,
	    )
	    print("Pipeline loaded successfully.")
	except Exception as e:
	    print(f"Error loading ASR pipeline: {e}.")
	    asr_pipeline = None

	# ==============================================================================
	# II. Summarization & Correction Setup (Gemini)
	# ==============================================================================
	# The Gemini key is read from the environment; without it no client is built.
	API_KEY = os.getenv("GEMINI_API_KEY")

	client = None
	if API_KEY:
	    try:
	        client = genai.Client(api_key=API_KEY)
	    except Exception as e:
	        print(f"Failed to initialize Gemini Client: {e}")
	        client = None
	else:
	    print("Warning: GEMINI_API_KEY not set. Summarization will fail.")

	# Gemini model used for both correction and summarization.
	MODEL_NAME = "gemini-2.5-flash"

	# Character budget per summarization chunk, derived from a token budget.
	# NOTE(review): the factors 5 and 0.9 look like "~5 characters per token"
	# and a 10% safety margin -- confirm with the author.
	MAX_TOKENS_PER_CHUNK = 10000
	CHUNK_SIZE_LIMIT = int(MAX_TOKENS_PER_CHUNK * 5 * 0.9)

	# --- Helper Functions ---
	def split_text_into_chunks(text: str, chunk_size_limit: "int | None" = None) -> list[str]:
	    """Split *text* into chunks of at most ``chunk_size_limit`` characters.

	    The text is cut on ``.`` and the resulting sentences are packed greedily
	    into chunks, separated by a single space. Blank fragments (produced by a
	    trailing period or by consecutive periods) are dropped, so no stray
	    ``"."`` pieces are emitted. A single sentence longer than the limit is
	    hard-split so that no returned chunk ever exceeds the limit.

	    Args:
	        text: The text to split.
	        chunk_size_limit: Maximum chunk length in characters. Defaults to the
	            module-level ``CHUNK_SIZE_LIMIT`` when omitted.

	    Returns:
	        The list of chunks, in order. Empty when *text* has no content.

	    Raises:
	        ValueError: If the effective limit is not positive.
	    """
	    limit = CHUNK_SIZE_LIMIT if chunk_size_limit is None else chunk_size_limit
	    if limit <= 0:
	        raise ValueError("chunk_size_limit must be positive")

	    chunks: list[str] = []
	    current = ""
	    for piece in text.split("."):
	        sentence = piece.strip()
	        if not sentence:
	            # Trailing period or "..": nothing to add.
	            continue
	        sentence += "."

	        # A sentence that cannot fit in any chunk is cut into limit-sized
	        # slices; whatever is pending is flushed first to keep the order.
	        while len(sentence) > limit:
	            if current:
	                chunks.append(current)
	                current = ""
	            chunks.append(sentence[:limit])
	            sentence = sentence[limit:]

	        candidate = f"{current} {sentence}" if current else sentence
	        if len(candidate) <= limit:
	            current = candidate
	        else:
	            chunks.append(current)
	            current = sentence

	    if current:
	        chunks.append(current)
	    return chunks

	def correct_and_format_text(raw_text: str) -> str:
	    """Correct spelling/grammar and add punctuation to ASR text using Gemini.

	    Args:
	        raw_text: Raw transcript text to clean up.

	    Returns:
	        The corrected text. Returns ``""`` when *raw_text* is blank, and
	        returns *raw_text* unchanged when no Gemini client is configured,
	        when the request raises, or when the model returns no text. The
	        result is therefore always a ``str``.
	    """
	    if not raw_text.strip():
	        return ""
	    if client is None:
	        return raw_text

	    correction_prompt = f"""
	You are an expert text editor. Your task is to take raw, unpunctuated text, often from a Speech-to-Text (ASR) system, and correct it.

	Perform the following actions:
	1. Fix Spelling and Grammar: Correct all spelling, syntax, and grammatical errors.
	2. Add Punctuation: Insert all necessary punctuation (periods, commas, question marks, etc.) to make the text readable and clear.
	3. Fix Transliterated Terms (CRITICAL): If you find English technical terms written in Arabic letters (e.g., "داتا بيز", "ديب ليرنينج", "نتورك"), you MUST convert them back to their correct English spelling (e.g., "Database", "Deep Learning", "Network") inside the text.
	4. Preserve Content: DO NOT add, delete, or change any core meaning or factual information. Only correct the form.

	CRITICAL INSTRUCTION:
	Output ONLY the corrected text. DO NOT include any conversational filler, introductions, or conclusions such as "Here is the corrected text", "إليك تصحيح النص", or similar phrases. Just return the text itself.

	Raw Text to Correct:
	---
	{raw_text}
	"""
	    try:
	        response = client.models.generate_content(model=MODEL_NAME, contents=correction_prompt)
	        corrected = response.text
	    except Exception as e:
	        print(f"Correction Error: {e}")
	        return raw_text

	    # `response.text` can be None or empty (e.g. when the response carries
	    # no text parts). Callers call `.strip()` on the result, so fall back to
	    # the raw text instead of returning None.
	    if not corrected or not corrected.strip():
	        print("Correction Error: model returned no text; keeping raw text.")
	        return raw_text
	    return corrected

	def smart_summarize_and_merge(text_to_summarize: str) -> str:
	    """Summarize text with Gemini, chunking long input (map) and merging (reduce).

	    Text longer than ``CHUNK_SIZE_LIMIT`` characters is split with
	    ``split_text_into_chunks``; each chunk is summarized on its own, and the
	    partial summaries are then merged by one final request. Short text still
	    goes through the final request, which reviews and formats the summary.

	    Args:
	        text_to_summarize: The text to summarize.

	    Returns:
	        The final bullet-point summary with ``**`` markers removed, or one of
	        these status strings: ``"No text to summarize."`` for blank input,
	        ``"Error: Gemini client not initialized."`` when no client exists,
	        ``"Failed to generate summary."`` when the model produced no usable
	        text, or ``"Summary Error: ..."`` when the final request raised.
	    """
	    if not text_to_summarize.strip():
	        return "No text to summarize."
	    if client is None:
	        return "Error: Gemini client not initialized."

	    # Split text if too long
	    if len(text_to_summarize) > CHUNK_SIZE_LIMIT:
	        chunks = split_text_into_chunks(text_to_summarize)
	    else:
	        chunks = [text_to_summarize]

	    partial_summaries = []

	    # Map Step: Summarize each chunk
	    for chunk_index, chunk in enumerate(chunks, start=1):
	        partial_prompt = f"""
	You are an expert summarizer. Summarize the following text into clear, key bullet points.
	Do not leave out any essential information. The summary must be in the same language as the source text.

	CRITICAL INSTRUCTION:
	Output ONLY the bullet points. DO NOT include any conversational filler like "Here is the summary" or "إليك التلخيص".

	Source Text:
	---
	{chunk}
	"""
	        try:
	            response = client.models.generate_content(model=MODEL_NAME, contents=partial_prompt)
	            partial_text = response.text
	        except Exception as e:
	            # Best effort: one failed chunk must not abort the whole summary,
	            # but the failure is logged rather than silently dropped.
	            print(f"Partial summary error for chunk {chunk_index}/{len(chunks)}: {e}")
	            continue

	        # `response.text` can be None; appending it would break the join below.
	        if partial_text and partial_text.strip():
	            partial_summaries.append(partial_text)
	        else:
	            print(f"Partial summary for chunk {chunk_index}/{len(chunks)} was empty; skipping.")

	    if not partial_summaries:
	        return "Failed to generate summary."

	    # Reduce Step: Merge summaries
	    if len(partial_summaries) > 1:
	        input_for_final_prompt = "\n\n--- Previous Chunk Summary ---\n\n".join(partial_summaries)
	        prompt_type = "summarize the provided partial summaries"
	    else:
	        input_for_final_prompt = partial_summaries[0]
	        prompt_type = "review and format the following summary"

	    final_prompt = f"""
	You are a professional text summarizer. {prompt_type} into clear, comprehensive Bullet Points.
	Use round bullet points (•) for the list items.

	Language Instructions:
	1. If the majority of the input text was in English: The final summary must be strictly in English.
	2. If the majority of the input text was in Arabic (including dialects): The final summary must be in Formal Arabic, while strictly preserving all foreign technical terms (English) exactly as they are without translation.

	CRITICAL INSTRUCTION:
	Output ONLY the final bullet points. DO NOT include any conversational filler, greetings, or phrases like "Here is the summary", "إليك التلخيص النهائي", etc.

	Input:
	---
	{input_for_final_prompt}
	"""
	    try:
	        response = client.models.generate_content(model=MODEL_NAME, contents=final_prompt)
	        final_text = response.text
	    except Exception as e:
	        return f"Summary Error: {e}"

	    # An empty final response is reported with the same message as a failed
	    # map step, instead of leaking an AttributeError message to the client.
	    if not final_text or not final_text.strip():
	        return "Failed to generate summary."
	    # Remove Markdown bold markers from the model output.
	    return final_text.replace('**', '')

	# ==============================================================================
	# III. API Endpoint Definition
	# ==============================================================================
	app = FastAPI(title="Streaming Transcription & Summarization API")


	@app.post("/process-audio/")
	async def process_audio_api(file: UploadFile = File(...)):
	    """Transcribe an uploaded audio file, correct the text, and summarize it.

	    The upload is decoded with pydub and cut into 30-second pieces that are
	    transcribed one by one. The raw text is buffered and sent to
	    ``correct_and_format_text`` every 5 minutes of audio (and at the end);
	    the corrected transcript is finally passed to
	    ``smart_summarize_and_merge``.

	    All temporary files live in a per-request temporary directory that is
	    removed when the request finishes, including on errors. The client's
	    file name is never used as a path, only (after validation) as an
	    extension hint.

	    Args:
	        file: The uploaded audio file.

	    Returns:
	        A JSON response with the keys ``"transcript"`` and ``"summary"``.

	    Raises:
	        HTTPException: 500 when the ASR model is not loaded, 400 when the
	            upload cannot be decoded as audio.
	    """
	    # NOTE(review): transcription and the Gemini calls below are blocking
	    # calls inside an `async def` handler, so they presumably stall the
	    # event loop for the duration of the request -- confirm and consider
	    # offloading them to a worker thread.
	    import tempfile  # local import keeps this block self-contained

	    if asr_pipeline is None:
	        raise HTTPException(status_code=500, detail="Error: Transcription model not loaded.")

	    print(f"Starting processing for uploaded file: {file.filename}")

	    CHUNK_DURATION_MS = 30 * 1000
	    BUFFER_LIMIT_MS = 5 * 60 * 1000

	    # The client-supplied name is untrusted: keep only a plain alphanumeric
	    # extension so it cannot influence where the file is written.
	    upload_name = os.path.basename(file.filename or "")
	    suffix = os.path.splitext(upload_name)[1]
	    if not suffix[1:].isalnum():
	        suffix = ""

	    corrected_parts = []
	    buffer_parts = []

	    # A private directory per request avoids collisions between concurrent
	    # uploads and guarantees cleanup on every exit path.
	    with tempfile.TemporaryDirectory(prefix="process_audio_") as work_dir:
	        temp_file_path = os.path.join(work_dir, f"upload{suffix}")
	        with open(temp_file_path, "wb") as buffer:
	            shutil.copyfileobj(file.file, buffer)

	        try:
	            audio = AudioSegment.from_file(temp_file_path)
	        except Exception as e:
	            raise HTTPException(status_code=400, detail=f"Error loading audio file: {e}") from e

	        total_duration = len(audio)

	        # ---------------------------------------------------------
	        # Phase 1: Processing Loop (Whisper ASR + 5-Min Gemini Buffer)
	        # ---------------------------------------------------------
	        for i in range(0, total_duration, CHUNK_DURATION_MS):
	            chunk_num = (i // CHUNK_DURATION_MS) + 1

	            # 1. Prepare Chunk for Whisper (30s)
	            chunk_audio = audio[i : i + CHUNK_DURATION_MS]
	            chunk_filename = os.path.join(work_dir, f"temp_chunk_{chunk_num}.wav")
	            chunk_audio.export(chunk_filename, format="wav")

	            # 2. ASR (Speech-to-Text)
	            try:
	                asr_result = asr_pipeline(chunk_filename, return_timestamps=True)
	                raw_text = asr_result['text'].strip()
	                if raw_text:
	                    buffer_parts.append(raw_text)  # add the text to the buffer
	            except Exception as e:
	                # Best effort: a failed chunk is logged and skipped.
	                print(f"Error in ASR for chunk {chunk_num}: {e}")
	            finally:
	                # Remove the chunk right away to keep disk usage bounded.
	                if os.path.exists(chunk_filename):
	                    os.remove(chunk_filename)

	            # 3. Check Buffer (Is it 5 mins yet? Or is it the last chunk?)
	            current_position_ms = i + len(chunk_audio)
	            if (current_position_ms % BUFFER_LIMIT_MS == 0) or (current_position_ms >= total_duration):
	                buffer_text = " ".join(buffer_parts)
	                if buffer_text.strip():
	                    print(f"Buffer reached {current_position_ms/60000:.2f} mins. Sending to Gemini for correction...")
	                    corrected_chunk = correct_and_format_text(buffer_text)
	                    # Guard against an empty/None correction result so the
	                    # transcript never loses text and `.strip()` cannot fail.
	                    corrected_chunk = (corrected_chunk or buffer_text).strip()
	                    if corrected_chunk:
	                        corrected_parts.append(corrected_chunk)
	                    buffer_parts = []

	    # ---------------------------------------------------------
	    # Phase 2: Final Summarization
	    # ---------------------------------------------------------
	    print("All chunks processed. Starting summarization...")

	    accumulated_corrected_text = "\n\n".join(corrected_parts)

	    if len(accumulated_corrected_text.strip()) > 50:
	        final_summary = smart_summarize_and_merge(accumulated_corrected_text)
	    else:
	        final_summary = "Text too short to summarize."

	    return JSONResponse(content={
	        "transcript": accumulated_corrected_text.strip(),
	        "summary": final_summary.strip()
	    })