transcriptinator_v2 / ai_providers.py
emmajeed's picture
Upload 5 files
7ee2bc7 verified
"""
AI Provider Abstraction Layer for Transcriptinator
Supports multiple AI providers: Gemini and OpenRouter
"""
from abc import ABC, abstractmethod
from typing import Dict, List
import google.generativeai as genai
import requests
class TranscriptionProvider(ABC):
    """Abstract interface implemented by every AI transcription backend.

    Concrete providers must override all three methods below; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def transcribe(self, audio_file_path: str) -> str:
        """Return the transcription of the audio file at ``audio_file_path``."""

    @abstractmethod
    def generate_summary(self, text: str) -> str:
        """Return a summary of the transcription ``text``."""

    @abstractmethod
    def generate_key_ideas(self, text: str) -> List[Dict[str, str]]:
        """Return the key ideas extracted from the transcription ``text``."""
class GeminiProvider(TranscriptionProvider):
    """Google Gemini provider with configurable models.

    Handles audio transcription as well as summary and key-idea generation.
    Audio is sent inline with the request, so ``transcribe`` reads the whole
    file into memory.
    """

    # Display name -> Gemini model identifier passed to ``GenerativeModel``.
    AVAILABLE_MODELS = {
        "Gemini 2.5 Flash": "models/gemini-2.5-flash",
        "Gemini 2.0 Flash": "models/gemini-2.0-flash-exp",
        "Gemini 1.5 Flash": "models/gemini-1.5-flash"
    }

    # Lower-case file extension -> MIME type declared for the inline audio.
    # NOTE(review): values follow the audio formats listed in the Gemini API
    # audio documentation; confirm against the current docs when adding more.
    AUDIO_MIME_TYPES = {
        "mp3": "audio/mp3",
        "wav": "audio/wav",
        "aiff": "audio/aiff",
        "aif": "audio/aiff",
        "aac": "audio/aac",
        "ogg": "audio/ogg",
        "flac": "audio/flac",
    }

    # Used for unknown or missing extensions (the previously hard-coded value).
    DEFAULT_AUDIO_MIME_TYPE = "audio/mp3"

    def __init__(self, api_key: str, model_name: str):
        """Configure the Gemini SDK and create the model client.

        Args:
            api_key: Gemini API key; passed to ``genai.configure``.
            model_name: Display name; must be a key of ``AVAILABLE_MODELS``.

        Raises:
            KeyError: If ``model_name`` is not a key of ``AVAILABLE_MODELS``.
        """
        self.api_key = api_key
        self.model_name = model_name
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(self.AVAILABLE_MODELS[model_name])

    @classmethod
    def _audio_mime_type(cls, audio_file_path) -> str:
        """Choose the MIME type for ``audio_file_path`` from its extension."""
        # str() keeps path-like objects working, as they do with open().
        _, dot, extension = str(audio_file_path).rpartition(".")
        if not dot:
            return cls.DEFAULT_AUDIO_MIME_TYPE
        return cls.AUDIO_MIME_TYPES.get(extension.lower(), cls.DEFAULT_AUDIO_MIME_TYPE)

    @staticmethod
    def _parse_key_ideas(raw_text: str) -> List[Dict[str, str]]:
        """Turn a bulleted ``idea: description`` list into a list of dicts."""
        ideas = []
        for line in raw_text.splitlines():
            # Remove bullet markers and markdown bold so that a line such as
            # "* **Idea:** text" does not leave stray asterisks in the output.
            cleaned = line.strip().lstrip('-* ').replace('**', '').strip()
            if not cleaned:
                continue
            # Only the first colon separates idea from description; a line
            # without a colon becomes an idea with an empty description.
            idea, _, description = cleaned.partition(':')
            ideas.append({'idea': idea.strip(), 'description': description.strip()})
        return ideas

    def transcribe(self, audio_file_path: str) -> str:
        """Generate transcription using Gemini API with timestamps and speakers.

        Args:
            audio_file_path: Path of the audio file to transcribe. The MIME
                type sent to the API is derived from the file extension.

        Returns:
            The text of the model response.

        Raises:
            Exception: Wrapping any failure while reading the file or calling
                the API; the original error is chained as ``__cause__``.
        """
        try:
            with open(audio_file_path, "rb") as audio_file:
                audio_data = audio_file.read()
            contents = [
                {
                    "role": "user",
                    "parts": [
                        {
                            "mime_type": self._audio_mime_type(audio_file_path),
                            "data": audio_data
                        },
                        "Create a clean transcription of the audio file in English. Tag timestamps and speakers separately within the transcription. If speakers can be identified, use their names; otherwise, use 'Speaker 1', 'Speaker 2', etc. **Return ONLY the raw transcription text, starting directly with the first line of the transcription.** Do not include any introductory phrases, speaker identification plans, completion messages, or any text other than the transcription itself."
                    ]
                },
                # A canned model turn acknowledging the instructions follows
                # the user turn (presumably to discourage preamble in the
                # real answer -- NOTE(review): confirm it is still needed).
                {
                    "role": "model",
                    "parts": [
                        "Understood. I will provide a clean, timestamped, and speaker-tagged transcription of the audio file, returning only the transcription text as requested."
                    ]
                }
            ]
            response = self.model.generate_content(contents)
            return response.text
        except Exception as e:
            raise Exception(f"Error during Gemini transcription: {e}") from e

    def generate_summary(self, text: str) -> str:
        """Generate a concise 2-3 sentence summary using Gemini.

        Args:
            text: Transcription text to summarize.

        Returns:
            The stripped model response, or an ``"Error generating summary: ..."``
            string if the request fails (errors are reported, not raised).
        """
        try:
            prompt_text = (
                "\n"
                "Please read the following transcription text and write a concise summary of the main points in 2-3 sentences.\n"
                "Transcription Text:\n"
                f"{text}\n"
                "Summary:\n"
            )
            response = self.model.generate_content(prompt_text)
            return response.text.strip()
        except Exception as e:
            return f"Error generating summary: {e}"

    def generate_key_ideas(self, text: str) -> List[Dict[str, str]]:
        """Identify 3-5 key ideas from the transcription using Gemini.

        Args:
            text: Transcription text to analyze.

        Returns:
            A list of ``{'idea': ..., 'description': ...}`` dicts parsed from
            the model's bulleted answer. On failure a single-element list
            describing the error is returned instead of raising.
        """
        try:
            prompt_text = (
                "\n"
                "Please read the following transcription text and identify 3-5 key ideas or concepts discussed.\n"
                "Return these key ideas as a bulleted list, with each item in the list being an idea followed by a short (1-sentence) description of the idea.\n"
                "Transcription Text:\n"
                f"{text}\n"
                "Key Ideas:\n"
            )
            response = self.model.generate_content(prompt_text)
            return self._parse_key_ideas(response.text.strip())
        except Exception as e:
            return [{'idea': 'Error generating key ideas', 'description': str(e)}]
class OpenRouterProvider(TranscriptionProvider):
    """OpenRouter API provider for text generation (summary/key ideas).

    Audio transcription is not supported; ``transcribe`` always raises.
    """

    # Using DeepSeek R1 - excellent free model for reasoning and text generation
    MODEL_ID = "deepseek/deepseek-r1-0528:free"
    API_URL = "https://openrouter.ai/api/v1/chat/completions"

    # Seconds to wait for the API. Without a timeout ``requests`` can block
    # indefinitely on a stalled connection.
    # NOTE(review): 120s is a guess at a generous bound for a slow model; tune.
    REQUEST_TIMEOUT = 120

    def __init__(self, api_key: str, model_name: str = None):
        """Store the API key.

        Args:
            api_key: OpenRouter API key, sent as a bearer token.
            model_name: Ignored; ``MODEL_ID`` is always used. Accepted so the
                constructor can be called like the other providers.
        """
        # model_name is ignored for OpenRouter since we use fixed DeepSeek R1
        self.api_key = api_key

    def transcribe(self, audio_file_path: str) -> str:
        """Not supported - OpenRouter doesn't handle audio.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("OpenRouter doesn't support audio transcription. Use Gemini provider.")

    def _post_chat(self, prompt: str):
        """POST ``prompt`` as a single user message and return the response."""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.MODEL_ID,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        return requests.post(
            self.API_URL,
            headers=headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )

    @staticmethod
    def _first_choice_content(result):
        """Return the first choice's message content, or None if absent."""
        if not isinstance(result, dict):
            return None
        choices = result.get("choices")
        if not choices:
            return None
        # May itself be None when the model returned no text.
        return choices[0]["message"]["content"]

    @staticmethod
    def _strip_list_marker(line: str) -> str:
        """Remove markdown bold, bullets and ``1.`` / ``1)`` numbering."""
        line = line.replace("**", "").strip().lstrip("-*•").strip()
        digit_count = len(line) - len(line.lstrip("0123456789"))
        if digit_count and line[digit_count:digit_count + 1] in (".", ")"):
            line = line[digit_count + 1:]
        return line.strip()

    @classmethod
    def _parse_key_ideas(cls, content: str) -> List[Dict[str, str]]:
        """Parse ``Idea:`` / ``Description:`` pairs out of the model answer."""
        key_ideas_list = []
        current_idea = None
        for raw_line in content.splitlines():
            # Normalise first so "1. **Idea:** X" is recognised as an idea.
            line = cls._strip_list_marker(raw_line)
            if line.startswith("Idea:"):
                if current_idea:
                    key_ideas_list.append(current_idea)
                # Slice off only the leading label; the title itself may
                # legitimately contain the text "Idea:".
                current_idea = {'idea': line[len("Idea:"):].strip(), 'description': ''}
            elif line.startswith("Description:") and current_idea:
                current_idea['description'] = line[len("Description:"):].strip()
            elif ':' in line and not current_idea:
                # Fallback parsing: treat "title: text" as a complete idea.
                title, _, description = line.partition(':')
                key_ideas_list.append({
                    'idea': title.strip().strip('*').strip(),
                    'description': description.strip()
                })
        # Add last idea if exists
        if current_idea and current_idea['idea']:
            key_ideas_list.append(current_idea)
        return key_ideas_list

    def generate_summary(self, text: str) -> str:
        """Generate summary using OpenRouter DeepSeek R1.

        Args:
            text: Transcription text; only the first 8000 characters are sent.

        Returns:
            The stripped summary, or a human-readable message when the API
            responds with a non-200 status, an unexpected payload, or the
            request fails. Errors are reported in the string, not raised.
        """
        try:
            # Truncate text if too long
            text_to_summarize = text[:8000]
            response = self._post_chat(
                f"Please provide a concise 2-3 sentence summary of the following transcription:\n\n{text_to_summarize}"
            )
            # Handle errors
            if response.status_code != 200:
                return f"Summary unavailable: OpenRouter API error (status {response.status_code})"
            content = self._first_choice_content(response.json())
            if content is None:
                return "Summary generation completed but format unexpected."
            return content.strip()
        except Exception as e:
            return f"Error generating summary: {e}"

    def generate_key_ideas(self, text: str) -> List[Dict[str, str]]:
        """Generate key ideas using OpenRouter DeepSeek R1.

        Args:
            text: Transcription text; only the first 6000 characters are sent.

        Returns:
            At most five ``{'idea': ..., 'description': ...}`` dicts. If the
            answer cannot be parsed, its first sentences are returned as
            ``Key Point N`` entries. API or request failures yield a
            single-element list describing the problem instead of raising.
        """
        try:
            # Truncate text if too long
            text_to_analyze = text[:6000]
            response = self._post_chat(
                "Extract 3-5 key ideas from this transcription. Format each as:\n"
                "Idea: Brief title\n"
                "Description: One sentence explanation\n"
                f"{text_to_analyze}"
            )
            if response.status_code != 200:
                return [{'idea': 'Key ideas unavailable', 'description': f'OpenRouter API error (status {response.status_code})'}]
            content = self._first_choice_content(response.json())
            if content is None:
                return [{'idea': 'Key ideas extraction', 'description': 'Unable to parse response'}]
            key_ideas_list = self._parse_key_ideas(content)
            # Fallback if parsing fails
            if not key_ideas_list:
                # Just use first few sentences
                sentences = [s.strip() for s in content.split('.') if s.strip()][:5]
                key_ideas_list = [
                    {'idea': f'Key Point {i}', 'description': sent}
                    for i, sent in enumerate(sentences, 1)
                ]
            return key_ideas_list[:5]
        except Exception as e:
            return [{'idea': 'Error generating key ideas', 'description': str(e)}]
def get_provider(provider_type: str, api_key: str, model_name: str) -> TranscriptionProvider:
    """Build the provider instance that matches ``provider_type``.

    Args:
        provider_type: Either ``"Gemini"`` or ``"OpenRouter"``.
        api_key: API key forwarded to the provider constructor.
        model_name: Model name forwarded to the provider constructor.

    Returns:
        A ``GeminiProvider`` or ``OpenRouterProvider`` instance.

    Raises:
        ValueError: If ``provider_type`` is neither of the supported names.
    """
    # Reject unsupported names up front so the happy path stays flat.
    if provider_type not in ("Gemini", "OpenRouter"):
        raise ValueError(f"Unknown provider: {provider_type}")
    provider_cls = GeminiProvider if provider_type == "Gemini" else OpenRouterProvider
    return provider_cls(api_key, model_name)