Spaces:

2begyb
/

HackingFactory-v2

Sleeping

App Files Files Community

HackingFactory-v2 / server /_core /voiceTranscription.ts

FECUOY

Initial commit: HackingFactory v2 Enhanced with Self-Refining AI features

4c41b3d 3 months ago

raw

history blame contribute delete

8.11 kB

	/**
	* Voice transcription helper using internal Speech-to-Text service
	*
	* Frontend implementation guide:
	* 1. Capture audio using MediaRecorder API
	* 2. Upload audio to storage (e.g., S3) to get URL
	* 3. Call transcription with the URL
	*
	* Example usage:
	* ```tsx
	* // Frontend component
	* const transcribeMutation = trpc.voice.transcribe.useMutation({
	* onSuccess: (data) => {
	* console.log(data.text); // Full transcription
	* console.log(data.language); // Detected language
	* console.log(data.segments); // Timestamped segments
	* }
	* });
	*
	* // After uploading audio to storage
	* transcribeMutation.mutate({
	* audioUrl: uploadedAudioUrl,
	* language: 'en', // optional
	* prompt: 'Transcribe the meeting' // optional
	* });
	* ```
	*/
	import { ENV } from "./env";

	/**
	* Input accepted by `transcribeAudio`.
	*
	* `audioUrl` is downloaded by the server before the audio is forwarded to the
	* transcription service. When `prompt` is omitted, `transcribeAudio` builds a
	* default prompt, mentioning the language when `language` is provided.
	*/
	export type TranscribeOptions = {
	audioUrl: string; // URL the server fetches the audio file from (e.g., S3 URL)
	language?: string; // Optional: language code (e.g., "en", "es", "zh"); used when building the default prompt
	prompt?: string; // Optional: custom prompt; takes precedence over the generated default
	};

	/**
	* One entry of `WhisperResponse.segments`, in the native Whisper API segment format.
	*
	* The fields are passed through from the service unchanged; this module does
	* not read or validate them.
	* NOTE(review): `start` / `end` are presumably offsets in seconds — confirm
	* against the transcription service documentation.
	*/
	export type WhisperSegment = {
	id: number;
	seek: number;
	start: number;
	end: number;
	text: string;
	tokens: number[];
	temperature: number;
	avg_logprob: number;
	compression_ratio: number;
	no_speech_prob: number;
	};

	/**
	* Native Whisper API response format (requested via `response_format: "verbose_json"`).
	*
	* `transcribeAudio` only verifies that `text` is a non-empty string; the other
	* fields are trusted as returned by the service.
	*/
	export type WhisperResponse = {
	task: "transcribe";
	language: string;
	duration: number;
	text: string;
	segments: WhisperSegment[];
	};

	/** Success result of `transcribeAudio`: the native Whisper API response, returned without remapping. */
	export type TranscriptionResponse = WhisperResponse; // Return native Whisper API response directly

	/**
	 * Failure result returned (not thrown) by `transcribeAudio`.
	 *
	 * Callers can distinguish it from a successful response with `'error' in result`.
	 */
	export type TranscriptionError = {
	  /** Short, human-readable summary of what went wrong. */
	  error: string;
	  /** Machine-readable failure category. */
	  code:
	    | "FILE_TOO_LARGE"
	    | "INVALID_FORMAT"
	    | "TRANSCRIPTION_FAILED"
	    | "UPLOAD_FAILED"
	    | "SERVICE_ERROR";
	  /** Optional diagnostic detail (HTTP status, underlying error message, ...). */
	  details?: string;
	};

	/**
	 * Transcribe audio to text using the internal Speech-to-Text service.
	 *
	 * Flow: check that `ENV.forgeApiUrl` and `ENV.forgeApiKey` are set, download
	 * the audio from `options.audioUrl`, enforce the 16MB size limit, then POST the
	 * bytes as multipart form data to `v1/audio/transcriptions` under the
	 * configured base URL and return the parsed JSON.
	 *
	 * The function does not throw: every failure, including unexpected
	 * exceptions, is reported as a `TranscriptionError` value.
	 *
	 * @param options - Audio URL plus optional language code and custom prompt
	 * @returns The Whisper-style response on success, otherwise a `TranscriptionError`
	 */
	export async function transcribeAudio(
	  options: TranscribeOptions
	): Promise<TranscriptionResponse | TranscriptionError> {
	  const maxSizeMB = 16;
	  const maxSizeBytes = maxSizeMB * 1024 * 1024;

	  // Builds the size-limit error; shared by the header pre-check and the post-download check.
	  const fileTooLarge = (sizeBytes: number): TranscriptionError => ({
	    error: "Audio file exceeds maximum size limit",
	    code: "FILE_TOO_LARGE",
	    details: `File size is ${(sizeBytes / (1024 * 1024)).toFixed(2)}MB, maximum allowed is ${maxSizeMB}MB`
	  });

	  try {
	    // Step 1: Validate environment configuration
	    if (!ENV.forgeApiUrl) {
	      return {
	        error: "Voice transcription service is not configured",
	        code: "SERVICE_ERROR",
	        details: "BUILT_IN_FORGE_API_URL is not set"
	      };
	    }
	    if (!ENV.forgeApiKey) {
	      return {
	        error: "Voice transcription service authentication is missing",
	        code: "SERVICE_ERROR",
	        details: "BUILT_IN_FORGE_API_KEY is not set"
	      };
	    }

	    // Step 2: Download audio from URL
	    let audioBuffer: Buffer;
	    let mimeType: string;
	    try {
	      // NOTE(review): the URL is fetched server-side exactly as supplied. If it can
	      // originate from untrusted clients, consider restricting schemes/hosts (SSRF).
	      const downloadResponse = await fetch(options.audioUrl);
	      if (!downloadResponse.ok) {
	        return {
	          error: "Failed to download audio file",
	          code: "INVALID_FORMAT",
	          details: `HTTP ${downloadResponse.status}: ${downloadResponse.statusText}`
	        };
	      }

	      // Reject oversized files from the declared length before buffering the
	      // whole body in memory. A missing or malformed header yields 0/NaN, which
	      // never exceeds the limit, so the post-download check below still applies.
	      const declaredBytes = Number(downloadResponse.headers.get('content-length'));
	      if (declaredBytes > maxSizeBytes) {
	        // Release the connection; the body is not going to be read.
	        void downloadResponse.body?.cancel().catch(() => undefined);
	        return fileTooLarge(declaredBytes);
	      }

	      audioBuffer = Buffer.from(await downloadResponse.arrayBuffer());
	      mimeType = downloadResponse.headers.get('content-type') || 'audio/mpeg';

	      // Check actual file size (16MB limit)
	      if (audioBuffer.length > maxSizeBytes) {
	        return fileTooLarge(audioBuffer.length);
	      }
	    } catch (error) {
	      return {
	        error: "Failed to fetch audio file",
	        code: "SERVICE_ERROR",
	        details: error instanceof Error ? error.message : "Unknown error"
	      };
	    }

	    // Step 3: Create FormData for multipart upload to Whisper API
	    const formData = new FormData();

	    // Content-Type headers may carry parameters (e.g. "audio/webm;codecs=opus");
	    // only the bare, lower-cased type is used to pick the filename extension.
	    const [rawType = ""] = mimeType.split(";");
	    const filename = `audio.${getFileExtension(rawType.trim().toLowerCase())}`;

	    // Create a Blob from the buffer and append to form
	    const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
	    formData.append("file", audioBlob, filename);

	    formData.append("model", "whisper-1");
	    formData.append("response_format", "verbose_json");

	    // Add prompt - use custom prompt if provided, otherwise generate based on language
	    const prompt = options.prompt || (
	      options.language
	        ? `Transcribe the user's voice to text, the user's working language is ${getLanguageName(options.language)}`
	        : "Transcribe the user's voice to text"
	    );
	    formData.append("prompt", prompt);

	    // Step 4: Call the transcription service
	    // The base URL must end with "/" so the relative path is appended rather
	    // than replacing the last path segment.
	    const baseUrl = ENV.forgeApiUrl.endsWith("/")
	      ? ENV.forgeApiUrl
	      : `${ENV.forgeApiUrl}/`;

	    const fullUrl = new URL("v1/audio/transcriptions", baseUrl).toString();

	    const serviceResponse = await fetch(fullUrl, {
	      method: "POST",
	      headers: {
	        authorization: `Bearer ${ENV.forgeApiKey}`,
	        "Accept-Encoding": "identity",
	      },
	      body: formData,
	    });

	    if (!serviceResponse.ok) {
	      const errorText = await serviceResponse.text().catch(() => "");
	      return {
	        error: "Transcription service request failed",
	        code: "TRANSCRIPTION_FAILED",
	        details: `${serviceResponse.status} ${serviceResponse.statusText}${errorText ? `: ${errorText}` : ""}`
	      };
	    }

	    // Step 5: Parse and return the transcription result
	    const payload: unknown = await serviceResponse.json();

	    // Validate response structure without assuming the payload is an object
	    // (a JSON `null` or primitive must not cause a property access on null).
	    const text =
	      typeof payload === "object" && payload !== null
	        ? (payload as { text?: unknown }).text
	        : undefined;
	    if (typeof text !== "string" || text === "") {
	      return {
	        error: "Invalid transcription response",
	        code: "SERVICE_ERROR",
	        details: "Transcription service returned an invalid response format"
	      };
	    }

	    return payload as WhisperResponse; // Return native Whisper API response directly

	  } catch (error) {
	    // Handle unexpected errors
	    return {
	      error: "Voice transcription failed",
	      code: "SERVICE_ERROR",
	      details: error instanceof Error ? error.message : "An unexpected error occurred"
	    };
	  }
	}

	/**
	 * Helper function to get the upload filename extension for an audio MIME type.
	 *
	 * The lookup ignores letter case and any parameters after `;`, so a value such
	 * as `audio/webm;codecs=opus` resolves to `webm`.
	 *
	 * @param mimeType - MIME type, optionally with parameters
	 * @returns The mapped extension, or `audio` when the type is not recognised
	 */
	function getFileExtension(mimeType: string): string {
	  // A Map (rather than an object literal) so that inputs like "constructor"
	  // cannot resolve to inherited Object.prototype members.
	  const mimeToExt = new Map<string, string>([
	    ['audio/webm', 'webm'],
	    ['audio/mp3', 'mp3'],
	    ['audio/mpeg', 'mp3'],
	    ['audio/wav', 'wav'],
	    ['audio/wave', 'wav'],
	    ['audio/ogg', 'ogg'],
	    ['audio/m4a', 'm4a'],
	    ['audio/mp4', 'm4a'],
	  ]);

	  // Drop parameters ("; codecs=...") and normalise case before the lookup.
	  const [bareType = ''] = mimeType.split(';');

	  return mimeToExt.get(bareType.trim().toLowerCase()) || 'audio';
	}

	/**
	 * Helper function to get the full English language name for a language code.
	 *
	 * Matching ignores case and surrounding whitespace. If the full code is not
	 * known, the primary subtag before `-` or `_` is tried, so `pt-BR` and `zh_CN`
	 * resolve to `Portuguese` and `Chinese`.
	 *
	 * @param langCode - Language code such as `en`, `EN` or `en-US`
	 * @returns The language name, or `langCode` unchanged when it is not recognised
	 */
	function getLanguageName(langCode: string): string {
	  // A Map (rather than an object literal) so that inputs like "toString"
	  // cannot resolve to inherited Object.prototype members.
	  const langMap = new Map<string, string>([
	    ['en', 'English'],
	    ['es', 'Spanish'],
	    ['fr', 'French'],
	    ['de', 'German'],
	    ['it', 'Italian'],
	    ['pt', 'Portuguese'],
	    ['ru', 'Russian'],
	    ['ja', 'Japanese'],
	    ['ko', 'Korean'],
	    ['zh', 'Chinese'],
	    ['ar', 'Arabic'],
	    ['hi', 'Hindi'],
	    ['nl', 'Dutch'],
	    ['pl', 'Polish'],
	    ['tr', 'Turkish'],
	    ['sv', 'Swedish'],
	    ['da', 'Danish'],
	    ['no', 'Norwegian'],
	    ['fi', 'Finnish'],
	  ]);

	  const normalized = langCode.trim().toLowerCase();
	  // Primary language subtag: "en-US" -> "en", "zh_CN" -> "zh".
	  const [primary = ''] = normalized.split(/[-_]/);

	  return langMap.get(normalized) || langMap.get(primary) || langCode;
	}

	/**
	* Example tRPC procedure implementation:
	*
	* ```ts
	* // In server/routers.ts
	* import { transcribeAudio } from "./_core/voiceTranscription";
	*
	* export const voiceRouter = router({
	* transcribe: protectedProcedure
	* .input(z.object({
	* audioUrl: z.string(),
	* language: z.string().optional(),
	* prompt: z.string().optional(),
	* }))
	* .mutation(async ({ input, ctx }) => {
	* const result = await transcribeAudio(input);
	*
	* // Check if it's an error
	* if ('error' in result) {
	* throw new TRPCError({
	* code: 'BAD_REQUEST',
	* message: result.error,
	* cause: result,
	* });
	* }
	*
	* // Optionally save transcription to database
	* await db.insert(transcriptions).values({
	* userId: ctx.user.id,
	* text: result.text,
	* duration: result.duration,
	* language: result.language,
	* audioUrl: input.audioUrl,
	* createdAt: new Date(),
	* });
	*
	* return result;
	* }),
	* });
	* ```
	*/