# ai_call/server.py — Voice Call AI Bridge backend (commit f03a824, "Update server.py")
import asyncio
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path

import edge_tts
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse, FileResponse
from google import genai
from groq import Groq
app = FastAPI(title="Voice Call AI Bridge Backend")
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
RECORDINGS_DIR = DATA_DIR / "recordings"
RESPONSES_DIR = DATA_DIR / "responses"
LOGS_DIR = DATA_DIR / "logs"
for d in (RECORDINGS_DIR, RESPONSES_DIR, LOGS_DIR):
d.mkdir(parents=True, exist_ok=True)
# ================================================================
# DYNAMIC API KEYS (unlimited)
# ================================================================
GROQ_API_KEYS = []
i = 1
while True:
key = os.getenv(f"GROQ_API_KEY_{i}")
if not key:
# also try plain GROQ_API_KEY for backward compat
key = os.getenv("GROQ_API_KEY") if i == 1 else None
if not key:
break
GROQ_API_KEYS.append(key)
i += 1
GEMINI_API_KEYS = []
i = 1
while True:
key = os.getenv(f"GEMINI_API_KEY_{i}")
if not key:
break
GEMINI_API_KEYS.append(key)
i += 1
GROQ_CHAT_MODEL = os.getenv("GROQ_CHAT_MODEL", "llama-3.3-70b-versatile")
GROQ_STT_MODEL = "whisper-large-v3" # large-v3 has better Indian language support than turbo
GEMINI_MODEL = "gemini-2.0-flash"
# Whisper language codes — telling Whisper the language upfront fixes accuracy dramatically
WHISPER_LANG_CODES = {
"gujarati": "gu",
"hindi": "hi",
"english": "en",
}
# App sends language hint from user's phone locale/preference
# Fallback: transcribe twice (once with gu, once auto) and pick longer result
print(f"Loaded {len(GROQ_API_KEYS)} Groq key(s), {len(GEMINI_API_KEYS)} Gemini key(s).")
# ================================================================
# MICROSOFT EDGE NEURAL VOICES (free, no API key, excellent quality)
# ================================================================
EDGE_VOICES = {
"English": "en-IN-NeerjaNeural", # Indian English, natural
"Hindi": "hi-IN-SwaraNeural", # Hindi female, very smooth
"Gujarati": "gu-IN-DhwaniNeural", # Gujarati female, native quality
}
# ================================================================
# LANGUAGE DETECTION
# ================================================================
def detect_language(text: str) -> str:
for char in text:
if '\u0A80' <= char <= '\u0AFF':
return "Gujarati"
if '\u0900' <= char <= '\u097F':
return "Hindi"
return "English"
# ================================================================
# STT: GROQ WHISPER (tries all keys)
# ================================================================
def transcribe_audio(path: Path, language_hint: str | None = None) -> str:
    """
    Transcribe the audio file at *path* with Groq Whisper.

    language_hint: "gujarati", "hindi", or "english" — passing the language
    explicitly dramatically improves accuracy. If omitted, the audio is
    transcribed once per language in WHISPER_LANG_CODES and a heuristic picks
    the most plausible result. Returns "" when every key/attempt fails, or a
    placeholder string when no Groq keys are configured.
    """
    if not GROQ_API_KEYS:
        return "(transcription skipped: no GROQ_API_KEY configured)"

    def _transcribe_with_lang(key: str, lang_code: str | None = None) -> str:
        # One Whisper call with a single API key; optional explicit language.
        client = Groq(api_key=key)
        with path.open("rb") as audio:
            kwargs = dict(
                model=GROQ_STT_MODEL,
                file=audio,
                response_format="verbose_json",
            )
            if lang_code:
                kwargs["language"] = lang_code  # explicit language = much better accuracy
            result = client.audio.transcriptions.create(**kwargs)
        # The SDK may return an object with .text or a plain dict — handle both.
        text = getattr(result, "text", None) or (result.get("text", "") if isinstance(result, dict) else "")
        return text.strip()

    # Try each key in order; a failure (rate limit, auth, network) moves on
    # to the next key rather than failing the whole request.
    for index, key in enumerate(GROQ_API_KEYS):
        try:
            if language_hint:
                # User told us the language — use it directly
                lang_code = WHISPER_LANG_CODES.get(language_hint.lower())
                text = _transcribe_with_lang(key, lang_code)
                print(f"[STT] Key #{index+1} ({language_hint}): {text[:60]}")
                return text
            else:
                # No hint — transcribe once per known language and pick the
                # most plausible result below.
                results = {}
                for lang_name, lang_code in WHISPER_LANG_CODES.items():
                    try:
                        t = _transcribe_with_lang(key, lang_code)
                        if t and t not in [".", "", " "]:
                            results[lang_name] = t
                            print(f"[STT] {lang_name} attempt: {t[:50]}")
                    except Exception:
                        # Best-effort per-language attempt; other languages may still work.
                        pass
                if not results:
                    # Nothing usable from this key — try the next one.
                    continue

                def has_gujarati_script(t):
                    # Any char in the Gujarati Unicode block (U+0A80–U+0AFF).
                    return any('઀' <= c <= '૿' for c in t)

                def has_hindi_script(t):
                    # Any char in the Devanagari Unicode block (U+0900–U+097F).
                    return any('ऀ' <= c <= 'ॿ' for c in t)

                def is_transliterated_english(gujarati_text, english_text):
                    """
                    Detect if Whisper just wrote English words in Gujarati script.
                    Strategy: count how many English words appear phonetically in Gujarati text.
                    Common English loanwords in Gujarati script are a giveaway.
                    Also: if English result has meaningful words and Gujarati has same word count,
                    it's likely transliteration.
                    """
                    # Common English words that Whisper writes in Gujarati script when confused
                    english_in_gujarati_markers = [
                        'એન', 'ધ', 'ઈન', 'ઈઝ', 'ઓફ', 'ટો', 'એ', 'કેન', 'યુ',
                        'આઈ', 'વી', 'ઓહ', 'હાઈ', 'ઓકે', 'યસ', 'નો', 'હેલો',
                        'ટોક', 'સ્પીક', 'ઈટ', 'માય', 'યોર', 'ઈઝ'
                    ]
                    marker_count = sum(1 for m in english_in_gujarati_markers if m in gujarati_text)
                    # If 2+ English markers found in Gujarati text, it's transliteration
                    if marker_count >= 2:
                        return True
                    # If english result is meaningful and gujarati word count matches english
                    guj_words = len(gujarati_text.split())
                    eng_words = len(english_text.split())
                    if english_text and abs(guj_words - eng_words) <= 1 and eng_words > 1:
                        # Same number of words = same sentence just transliterated
                        return True
                    return False

                gu_text = results.get("gujarati", "")
                hi_text = results.get("hindi", "")
                en_text = results.get("english", "")
                # Preference order: genuine Gujarati script (and not just
                # transliterated English), then genuine Devanagari, then the
                # English result, finally whatever attempt succeeded first.
                if gu_text and has_gujarati_script(gu_text) and not is_transliterated_english(gu_text, en_text):
                    best = gu_text
                    print(f"[STT] Real Gujarati detected: {best[:60]}")
                elif hi_text and has_hindi_script(hi_text):
                    best = hi_text
                    print(f"[STT] Real Hindi detected: {best[:60]}")
                else:
                    best = en_text or gu_text or list(results.values())[0]
                    print(f"[STT] English/fallback selected: {best[:60]}")
                return best
        except Exception as e:
            print(f"[STT] Key #{index+1} failed: {e}")
            continue
    # All keys exhausted without a usable transcript.
    return ""
# ================================================================
# AI: GROQ → GEMINI FALLBACK
# ================================================================
def build_prompt(user_text: str, language: str) -> str:
return (
f"You are a helpful voice assistant for rural villagers in India.\n"
f"You help with: farming tips, crop care, weather advice, government schemes, general questions.\n"
f"STRICT RULES:\n"
f"- Detect and reply ONLY in {language}. Use correct script.\n"
f"- Reply must be SHORT — this is a PHONE CALL. Maximum 2-3 sentences.\n"
f"- NO bullet points, NO lists, NO markdown. Speak naturally.\n"
f"- Be warm, simple, and clear for a rural farmer.\n"
f"\nUser said: {user_text}\n"
f"Your spoken reply ({language}, 2-3 sentences max):"
)
def try_groq_chat(prompt: str) -> str | None:
for index, key in enumerate(GROQ_API_KEYS):
try:
client = Groq(api_key=key)
resp = client.chat.completions.create(
model=GROQ_CHAT_MODEL,
messages=[{"role": "user", "content": prompt}],
max_tokens=120,
temperature=0.4,
)
result = resp.choices[0].message.content.strip()
print(f"[AI/Groq] Key #{index+1} success")
return result
except Exception as e:
if "429" in str(e) or "rate" in str(e).lower():
print(f"[AI/Groq] Key #{index+1} rate limited, trying next...")
else:
print(f"[AI/Groq] Key #{index+1} error: {e}")
continue
return None
def try_gemini_chat(prompt: str) -> str | None:
for index, key in enumerate(GEMINI_API_KEYS):
try:
client = genai.Client(api_key=key)
resp = client.models.generate_content(model=GEMINI_MODEL, contents=prompt)
print(f"[AI/Gemini] Key #{index+1} success")
return resp.text.strip()
except Exception as e:
if "429" in str(e) or "quota" in str(e).lower():
print(f"[AI/Gemini] Key #{index+1} quota exceeded, trying next...")
else:
print(f"[AI/Gemini] Key #{index+1} error: {e}")
continue
return None
def generate_reply(user_text: str, language: str) -> str:
prompt = build_prompt(user_text, language)
result = try_groq_chat(prompt) or try_gemini_chat(prompt)
if not result:
return {
"English": "Sorry, I could not process your request. Please try again.",
"Hindi": "माफ करें, अभी जवाब देने में असमर्थ हूँ। कृपया दोबारा कोशिश करें।",
"Gujarati": "માફ કરશો, હું હમણાં જવાબ આપી શકતો નથી. ફરી પ્રયાસ કરો."
}.get(language, "Sorry, unable to respond now.")
# Clean for speech
result = re.sub(r"[*_`#]", "", result)
result = result.replace("\n", " ").strip()
return result
# ================================================================
# TTS: MICROSOFT EDGE NEURAL (free, excellent Gujarati/Hindi)
# ================================================================
async def synthesize_tts(text: str, language: str, output_path: Path) -> None:
"""Async TTS — must be awaited. Works correctly inside FastAPI/uvicorn."""
voice = EDGE_VOICES.get(language, EDGE_VOICES["English"])
print(f"[TTS] Using voice: {voice} for {language}")
communicate = edge_tts.Communicate(text, voice)
await communicate.save(str(output_path))
# ================================================================
# HELPERS
# ================================================================
def now_id() -> str:
return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
# ================================================================
# ENDPOINTS
# ================================================================
@app.get("/health")
def health():
return {
"ok": True,
"groq_keys": len(GROQ_API_KEYS),
"gemini_keys": len(GEMINI_API_KEYS),
"tts_engine": "Microsoft Edge Neural TTS",
"voices": EDGE_VOICES,
"languages": ["English", "Hindi", "Gujarati"]
}
@app.post("/calls/upload")
async def upload_call(
phone: str = Form(default=""),
device_id: str = Form(default="android"),
meta: str = Form(default="{}"),
language_hint: str = Form(default=""), # "gujarati", "hindi", "english" or ""
audio_file: UploadFile = File(...),
):
"""
React Native app sends:
- audio_file: the recorded call audio (m4a/wav/mp3)
- phone: caller's number
- device_id: your gateway device ID
- meta: any extra JSON metadata
Returns:
- transcript: what user said
- reply_text: AI response
- reply_audio_url: URL to fetch the MP3 voice response
- language: detected language
"""
call_id = now_id()
# Save uploaded audio
ext = Path(audio_file.filename or "call.m4a").suffix or ".m4a"
raw_audio_path = RECORDINGS_DIR / f"{call_id}{ext}"
raw_audio_path.write_bytes(await audio_file.read())
print(f"[{call_id}] Audio saved: {raw_audio_path}")
# Step 1: Transcribe with Groq Whisper
transcript = transcribe_audio(raw_audio_path, language_hint=language_hint or None)
print(f"[{call_id}] Transcript: {transcript}")
# Step 2: Detect language
language = detect_language(transcript) if transcript else "Hindi"
print(f"[{call_id}] Language: {language}")
# Step 3: Generate AI reply
ai_text = generate_reply(transcript or "Hello", language)
print(f"[{call_id}] AI Reply: {ai_text}")
# Step 4: Convert to speech with Edge TTS
response_mp3 = RESPONSES_DIR / f"{call_id}.mp3"
await synthesize_tts(ai_text, language, response_mp3)
# Step 5: Save log
log_item = {
"call_id": call_id,
"created_at": datetime.utcnow().isoformat() + "Z",
"phone": phone,
"device_id": device_id,
"language": language,
"meta": json.loads(meta or "{}"),
"audio_path": str(raw_audio_path),
"transcript": transcript,
"reply_text": ai_text,
"reply_audio_path": str(response_mp3),
}
(LOGS_DIR / f"{call_id}.json").write_text(
json.dumps(log_item, indent=2, ensure_ascii=False), encoding="utf-8"
)
return JSONResponse({
"call_id": call_id,
"transcript": transcript,
"reply_text": ai_text,
"reply_audio_url": f"/calls/response/{call_id}",
"language": language,
"language_hint": language_hint or "auto-detected",
"stt_model": GROQ_STT_MODEL,
})
@app.get("/calls/response/{call_id}")
def get_response(call_id: str):
"""React Native app fetches this MP3 and plays it during the call."""
mp3 = RESPONSES_DIR / f"{call_id}.mp3"
if not mp3.exists():
return JSONResponse({"error": "not_found"}, status_code=404)
return FileResponse(mp3, media_type="audio/mpeg", filename=f"{call_id}.mp3")
@app.get("/calls/logs")
def get_logs(limit: int = 20):
"""View recent call logs."""
logs = sorted(LOGS_DIR.glob("*.json"), reverse=True)[:limit]
return [json.loads(f.read_text(encoding="utf-8")) for f in logs]
@app.get("/calls/logs/{call_id}")
def get_log(call_id: str):
"""View log for a specific call."""
log_file = LOGS_DIR / f"{call_id}.json"
if not log_file.exists():
return JSONResponse({"error": "not_found"}, status_code=404)
return json.loads(log_file.read_text(encoding="utf-8"))