Spaces:

emmajeed
/

transcriptinator_v2

Build error

File size: 11,185 Bytes

7ee2bc7

"""
AI Provider Abstraction Layer for Transcriptinator
Supports multiple AI providers: Gemini and OpenRouter
"""

import os
import re
from abc import ABC, abstractmethod
from typing import Dict, List

import google.generativeai as genai
import requests


class TranscriptionProvider(ABC):
    """Abstract interface every AI transcription provider implements.

    Concrete providers must override all three methods below; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def transcribe(self, audio_file_path: str) -> str:
        """Return the transcription text for the audio file at the given path."""

    @abstractmethod
    def generate_summary(self, text: str) -> str:
        """Return a summary of the given transcription text."""

    @abstractmethod
    def generate_key_ideas(self, text: str) -> List[Dict[str, str]]:
        """Return the key ideas found in the given transcription text."""


class GeminiProvider(TranscriptionProvider):
    """Google Gemini provider with configurable models.

    Handles audio transcription as well as summary and key-idea generation
    through a single ``genai.GenerativeModel`` instance.
    """

    # Display name -> model identifier handed to ``genai.GenerativeModel``.
    AVAILABLE_MODELS = {
        "Gemini 2.5 Flash": "models/gemini-2.5-flash",
        "Gemini 2.0 Flash": "models/gemini-2.0-flash-exp",
        "Gemini 1.5 Flash": "models/gemini-1.5-flash"
    }

    # File extension -> MIME type declared for the inline audio part.
    # Values follow the audio formats listed in the Gemini audio documentation.
    AUDIO_MIME_TYPES = {
        ".mp3": "audio/mp3",
        ".wav": "audio/wav",
        ".aiff": "audio/aiff",
        ".aac": "audio/aac",
        ".ogg": "audio/ogg",
        ".flac": "audio/flac",
    }

    # Used when the extension is missing or unknown; this is the value the
    # provider previously sent for every file.
    DEFAULT_AUDIO_MIME_TYPE = "audio/mp3"

    # Matches a leading "1. " / "2) " style list number.
    _NUMBERED_PREFIX_RE = re.compile(r"^\d+[.)]\s+")

    def __init__(self, api_key: str, model_name: str):
        """Configure the Gemini SDK and create the model client.

        Args:
            api_key: Gemini API key passed to ``genai.configure``.
            model_name: Display name; must be a key of ``AVAILABLE_MODELS``.

        Raises:
            KeyError: If ``model_name`` is not a key of ``AVAILABLE_MODELS``.
        """
        # Validate first so a bad model name fails with a readable message
        # and before the SDK is configured.
        if model_name not in self.AVAILABLE_MODELS:
            raise KeyError(
                f"Unknown Gemini model {model_name!r}; "
                f"expected one of {list(self.AVAILABLE_MODELS)}"
            )

        self.api_key = api_key
        self.model_name = model_name
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(self.AVAILABLE_MODELS[model_name])

    @classmethod
    def _audio_mime_type(cls, audio_file_path: str) -> str:
        """Pick the MIME type for a file from its extension (case-insensitive)."""
        extension = os.path.splitext(audio_file_path)[1].lower()
        return cls.AUDIO_MIME_TYPES.get(extension, cls.DEFAULT_AUDIO_MIME_TYPE)

    @classmethod
    def _parse_key_ideas(cls, key_ideas_text: str) -> List[Dict[str, str]]:
        """Turn the model's bulleted answer into ``{'idea', 'description'}`` dicts.

        Each non-empty line becomes one entry. Markdown bold markers and
        leading bullet or number markers are removed, then the line is split
        on its first colon; a line without a colon becomes an idea with an
        empty description.
        """
        key_ideas_list = []
        for raw_line in key_ideas_text.split('\n'):
            # Drop "**" first so "* **Idea:** text" does not leak asterisks
            # into either field, then strip bullet and number prefixes.
            item = raw_line.replace("**", "").strip().lstrip("-*• ")
            item = cls._NUMBERED_PREFIX_RE.sub("", item).strip()
            if not item:
                continue

            idea, separator, description = item.partition(':')
            if separator:
                key_ideas_list.append(
                    {'idea': idea.strip(), 'description': description.strip()}
                )
            else:
                key_ideas_list.append({'idea': item, 'description': ''})

        return key_ideas_list

    def transcribe(self, audio_file_path: str) -> str:
        """Generate transcription using Gemini API with timestamps and speakers.

        Args:
            audio_file_path: Path of the audio file; its bytes are sent inline.

        Returns:
            The text of the model response.

        Raises:
            Exception: On any failure (file access, API call, empty response),
                chained to the underlying error.
        """
        try:
            with open(audio_file_path, "rb") as audio_file:
                audio_data = audio_file.read()

            # NOTE(review): the history ends with a canned model turn, kept
            # exactly as before; confirm the API accepts a model-final history
            # for every model in AVAILABLE_MODELS.
            contents = [
                {
                    "role": "user",
                    "parts": [
                        {
                            "mime_type": self._audio_mime_type(audio_file_path),
                            "data": audio_data
                        },
                        "Create a clean transcription of the audio file in English. Tag timestamps and speakers separately within the transcription. If speakers can be identified, use their names; otherwise, use 'Speaker 1', 'Speaker 2', etc. **Return ONLY the raw transcription text, starting directly with the first line of the transcription.** Do not include any introductory phrases, speaker identification plans, completion messages, or any text other than the transcription itself."
                    ]
                },
                {
                    "role": "model",
                    "parts": [
                        "Understood. I will provide a clean, timestamped, and speaker-tagged transcription of the audio file, returning only the transcription text as requested."
                    ]
                }
            ]

            response = self.model.generate_content(contents)
            return response.text

        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise Exception(f"Error during Gemini transcription: {e}") from e

    def generate_summary(self, text: str) -> str:
        """Generate a concise 2-3 sentence summary using Gemini.

        Returns:
            The stripped model response, or an ``"Error generating summary: ..."``
            string if the request fails (this method does not raise).
        """
        try:
            prompt_text = f"""
            Please read the following transcription text and write a concise summary of the main points in 2-3 sentences.

            Transcription Text:
            {text}

            Summary:
            """

            response = self.model.generate_content(prompt_text)
            return response.text.strip()

        except Exception as e:
            return f"Error generating summary: {e}"

    def generate_key_ideas(self, text: str) -> List[Dict[str, str]]:
        """Identify 3-5 key ideas from the transcription using Gemini.

        Returns:
            A list of ``{'idea': ..., 'description': ...}`` dicts parsed from
            the model's bulleted answer, or a single error entry if the
            request fails (this method does not raise).
        """
        try:
            prompt_text = f"""
            Please read the following transcription text and identify 3-5 key ideas or concepts discussed.
            Return these key ideas as a bulleted list, with each item in the list being an idea followed by a short (1-sentence) description of the idea.

            Transcription Text:
            {text}

            Key Ideas:
            """

            response = self.model.generate_content(prompt_text)
            return self._parse_key_ideas(response.text.strip())

        except Exception as e:
            return [{'idea': 'Error generating key ideas', 'description': str(e)}]


class OpenRouterProvider(TranscriptionProvider):
    """OpenRouter API provider for text generation (summary/key ideas).

    Audio transcription is not supported; ``transcribe`` always raises.
    """

    # Using DeepSeek R1 - excellent free model for reasoning and text generation
    MODEL_ID = "deepseek/deepseek-r1-0528:free"
    API_URL = "https://openrouter.ai/api/v1/chat/completions"

    # Seconds to wait on the HTTP call. Without a timeout ``requests`` can
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 120

    # Input is truncated to these many characters before being sent.
    SUMMARY_MAX_CHARS = 8000
    KEY_IDEAS_MAX_CHARS = 6000

    # Upper bound on the number of key ideas returned.
    MAX_KEY_IDEAS = 5

    # Leading bullet ("-", "*", "•") or list number ("1. ", "2) ").
    _LIST_MARKER_RE = re.compile(r"^(?:[-*•]+\s*|\d+[.)]\s+)")
    # "Idea:" / "Idea 2:" label at the start of a line, any letter case.
    _IDEA_RE = re.compile(r"^idea(?:\s*#?\d+)?\s*:\s*(.*)$", re.IGNORECASE)
    # "Description:" label at the start of a line, any letter case.
    _DESCRIPTION_RE = re.compile(r"^description\s*:\s*(.*)$", re.IGNORECASE)

    def __init__(self, api_key: str, model_name: str = None):
        """Store the API key.

        Args:
            api_key: OpenRouter API key sent as a bearer token.
            model_name: Ignored; the provider always uses ``MODEL_ID``.
        """
        # model_name is ignored for OpenRouter since we use fixed DeepSeek R1
        self.api_key = api_key

    def transcribe(self, audio_file_path: str) -> str:
        """Not supported - OpenRouter doesn't handle audio.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("OpenRouter doesn't support audio transcription. Use Gemini provider.")

    def _post_chat(self, prompt: str):
        """POST *prompt* as a single user message and return the HTTP response."""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.MODEL_ID,
            "messages": [{"role": "user", "content": prompt}]
        }
        return requests.post(
            self.API_URL,
            headers=headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )

    @staticmethod
    def _extract_content(result):
        """Return the first choice's message content, or None if there is none."""
        choices = result.get("choices") if isinstance(result, dict) else None
        if not choices:
            return None
        return choices[0]["message"]["content"]

    @classmethod
    def _parse_key_ideas(cls, content: str) -> List[Dict[str, str]]:
        """Parse ``Idea:`` / ``Description:`` lines into key-idea dicts.

        Markdown bold markers and leading bullet or number markers are removed
        before matching, so ``1. **Idea:** X`` is read the same as ``Idea: X``.
        Until the first ``Idea:`` line is seen, any other ``title: text`` line
        is accepted as an idea as a fallback.
        """
        ideas = []
        current = None

        for raw_line in content.split('\n'):
            line = raw_line.replace("**", "").strip()
            line = cls._LIST_MARKER_RE.sub("", line).strip()
            if not line:
                continue

            idea_match = cls._IDEA_RE.match(line)
            if idea_match:
                # A new idea starts; flush the previous one if it has a title.
                if current and current['idea']:
                    ideas.append(current)
                current = {'idea': idea_match.group(1).strip(" *"), 'description': ''}
                continue

            description_match = cls._DESCRIPTION_RE.match(line)
            if description_match:
                # A description with no preceding idea has nothing to attach to.
                if current is not None:
                    current['description'] = description_match.group(1).strip(" *")
                continue

            if current is None and ':' in line:
                # Fallback parsing for "Title: explanation" style answers.
                title, _, detail = line.partition(':')
                title = title.strip(" *")
                if title:
                    ideas.append({'idea': title, 'description': detail.strip(" *")})

        if current and current['idea']:
            ideas.append(current)

        return ideas

    @classmethod
    def _sentences_as_key_points(cls, content: str) -> List[Dict[str, str]]:
        """Last-resort fallback: label the first few sentences as key points."""
        sentences = [s.strip() for s in content.split('.') if s.strip()]
        return [
            {'idea': f'Key Point {i}', 'description': sentence}
            for i, sentence in enumerate(sentences[:cls.MAX_KEY_IDEAS], 1)
        ]

    def generate_summary(self, text: str) -> str:
        """Generate summary using OpenRouter DeepSeek R1.

        Only the first ``SUMMARY_MAX_CHARS`` characters of ``text`` are sent.

        Returns:
            The stripped model answer, or a human-readable message describing
            the failure (this method does not raise).
        """
        try:
            # Truncate text if too long
            text_to_summarize = text[:self.SUMMARY_MAX_CHARS]

            response = self._post_chat(
                f"Please provide a concise 2-3 sentence summary of the following transcription:\n\n{text_to_summarize}"
            )

            # Handle errors
            if response.status_code != 200:
                return f"Summary unavailable: OpenRouter API error (status {response.status_code})"

            content = self._extract_content(response.json())
            if content is not None:
                return content.strip()

            return "Summary generation completed but format unexpected."

        except Exception as e:
            return f"Error generating summary: {e}"

    def generate_key_ideas(self, text: str) -> List[Dict[str, str]]:
        """Generate key ideas using OpenRouter DeepSeek R1.

        Only the first ``KEY_IDEAS_MAX_CHARS`` characters of ``text`` are sent.

        Returns:
            At most ``MAX_KEY_IDEAS`` dicts with ``'idea'`` and
            ``'description'`` keys, or a single entry describing the failure
            (this method does not raise).
        """
        try:
            # Truncate text if too long
            text_to_analyze = text[:self.KEY_IDEAS_MAX_CHARS]

            prompt = (
                "Extract 3-5 key ideas from this transcription. Format each as:\n"
                "Idea: Brief title\n"
                "Description: One sentence explanation\n"
                "\n"
                f"{text_to_analyze}"
            )
            response = self._post_chat(prompt)

            if response.status_code != 200:
                return [{'idea': 'Key ideas unavailable', 'description': f'OpenRouter API error (status {response.status_code})'}]

            content = self._extract_content(response.json())
            if content is None:
                return [{'idea': 'Key ideas extraction', 'description': 'Unable to parse response'}]

            key_ideas_list = self._parse_key_ideas(content)

            # Fallback if parsing fails
            if not key_ideas_list:
                key_ideas_list = self._sentences_as_key_points(content)

            return key_ideas_list[:self.MAX_KEY_IDEAS]

        except Exception as e:
            return [{'idea': 'Error generating key ideas', 'description': str(e)}]


def get_provider(provider_type: str, api_key: str, model_name: str) -> TranscriptionProvider:
    """Build the provider instance matching ``provider_type``.

    Args:
        provider_type: Either ``"Gemini"`` or ``"OpenRouter"``.
        api_key: API key forwarded to the provider constructor.
        model_name: Model name forwarded to the provider constructor.

    Raises:
        ValueError: If ``provider_type`` is neither of the two known names.
    """
    # Reject unknown names up front, then pick the class to instantiate.
    if provider_type not in ("Gemini", "OpenRouter"):
        raise ValueError(f"Unknown provider: {provider_type}")

    provider_cls = GeminiProvider if provider_type == "Gemini" else OpenRouterProvider
    return provider_cls(api_key, model_name)