# ai_call/server.py — Voice Call AI Bridge backend (commit f03a824, "Update server.py")
import asyncio
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path

import edge_tts
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse, FileResponse
from google import genai
from groq import Groq
app = FastAPI(title="Voice Call AI Bridge Backend")
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
RECORDINGS_DIR = DATA_DIR / "recordings"
RESPONSES_DIR = DATA_DIR / "responses"
LOGS_DIR = DATA_DIR / "logs"
for d in (RECORDINGS_DIR, RESPONSES_DIR, LOGS_DIR):
d.mkdir(parents=True, exist_ok=True)
# ================================================================
# DYNAMIC API KEYS (unlimited)
# ================================================================
GROQ_API_KEYS = []
i = 1
while True:
key = os.getenv(f"GROQ_API_KEY_{i}")
if not key:
# also try plain GROQ_API_KEY for backward compat
key = os.getenv("GROQ_API_KEY") if i == 1 else None
if not key:
break
GROQ_API_KEYS.append(key)
i += 1
GEMINI_API_KEYS = []
i = 1
while True:
key = os.getenv(f"GEMINI_API_KEY_{i}")
if not key:
break
GEMINI_API_KEYS.append(key)
i += 1
GROQ_CHAT_MODEL = os.getenv("GROQ_CHAT_MODEL", "llama-3.3-70b-versatile")
GROQ_STT_MODEL = "whisper-large-v3" # large-v3 has better Indian language support than turbo
GEMINI_MODEL = "gemini-2.0-flash"
# Whisper language codes — telling Whisper the language upfront fixes accuracy dramatically
WHISPER_LANG_CODES = {
"gujarati": "gu",
"hindi": "hi",
"english": "en",
}
# App sends language hint from user's phone locale/preference
# Fallback: transcribe twice (once with gu, once auto) and pick longer result
print(f"Loaded {len(GROQ_API_KEYS)} Groq key(s), {len(GEMINI_API_KEYS)} Gemini key(s).")
# ================================================================
# MICROSOFT EDGE NEURAL VOICES (free, no API key, excellent quality)
# ================================================================
EDGE_VOICES = {
"English": "en-IN-NeerjaNeural", # Indian English, natural
"Hindi": "hi-IN-SwaraNeural", # Hindi female, very smooth
"Gujarati": "gu-IN-DhwaniNeural", # Gujarati female, native quality
}
# ================================================================
# LANGUAGE DETECTION
# ================================================================
def detect_language(text: str) -> str:
for char in text:
if '\u0A80' <= char <= '\u0AFF':
return "Gujarati"
if '\u0900' <= char <= '\u097F':
return "Hindi"
return "English"
# ================================================================
# STT: GROQ WHISPER (tries all keys)
# ================================================================
def transcribe_audio(path: Path, language_hint: str | None = None) -> str:
    """
    Transcribe the audio file at *path* with Groq Whisper.

    language_hint: "gujarati", "hindi", or "english" — passing the language
    explicitly dramatically improves accuracy. If omitted, the audio is
    transcribed once per language in WHISPER_LANG_CODES and a heuristic picks
    the most plausible result. Returns "" when every key/attempt fails, or a
    placeholder string when no Groq keys are configured.
    """
    if not GROQ_API_KEYS:
        return "(transcription skipped: no GROQ_API_KEY configured)"

    def _transcribe_with_lang(key: str, lang_code: str | None = None) -> str:
        # One Whisper call with a single API key; optional explicit language.
        client = Groq(api_key=key)
        with path.open("rb") as audio:
            kwargs = dict(
                model=GROQ_STT_MODEL,
                file=audio,
                response_format="verbose_json",
            )
            if lang_code:
                kwargs["language"] = lang_code  # explicit language = much better accuracy
            result = client.audio.transcriptions.create(**kwargs)
        # The SDK may return an object with .text or a plain dict — handle both.
        text = getattr(result, "text", None) or (result.get("text", "") if isinstance(result, dict) else "")
        return text.strip()

    # Try each key in order; a failure (rate limit, auth, network) moves on
    # to the next key rather than failing the whole request.
    for index, key in enumerate(GROQ_API_KEYS):
        try:
            if language_hint:
                # User told us the language — use it directly
                lang_code = WHISPER_LANG_CODES.get(language_hint.lower())
                text = _transcribe_with_lang(key, lang_code)
                print(f"[STT] Key #{index+1} ({language_hint}): {text[:60]}")
                return text
            else:
                # No hint — transcribe once per known language and pick the
                # most plausible result below.
                results = {}
                for lang_name, lang_code in WHISPER_LANG_CODES.items():
                    try:
                        t = _transcribe_with_lang(key, lang_code)
                        if t and t not in [".", "", " "]:
                            results[lang_name] = t
                            print(f"[STT] {lang_name} attempt: {t[:50]}")
                    except Exception:
                        # Best-effort per-language attempt; other languages may still work.
                        pass
                if not results:
                    # Nothing usable from this key — try the next one.
                    continue

                def has_gujarati_script(t):
                    # Any char in the Gujarati Unicode block (U+0A80–U+0AFF).
                    return any('઀' <= c <= '૿' for c in t)

                def has_hindi_script(t):
                    # Any char in the Devanagari Unicode block (U+0900–U+097F).
                    return any('ऀ' <= c <= 'ॿ' for c in t)

                def is_transliterated_english(gujarati_text, english_text):
                    """
                    Detect if Whisper just wrote English words in Gujarati script.
                    Strategy: count how many English words appear phonetically in Gujarati text.
                    Common English loanwords in Gujarati script are a giveaway.
                    Also: if English result has meaningful words and Gujarati has same word count,
                    it's likely transliteration.
                    """
                    # Common English words that Whisper writes in Gujarati script when confused
                    english_in_gujarati_markers = [
                        'એન', 'ધ', 'ઈન', 'ઈઝ', 'ઓફ', 'ટો', 'એ', 'કેન', 'યુ',
                        'આઈ', 'વી', 'ઓહ', 'હાઈ', 'ઓકે', 'યસ', 'નો', 'હેલો',
                        'ટોક', 'સ્પીક', 'ઈટ', 'માય', 'યોર', 'ઈઝ'
                    ]
                    marker_count = sum(1 for m in english_in_gujarati_markers if m in gujarati_text)
                    # If 2+ English markers found in Gujarati text, it's transliteration
                    if marker_count >= 2:
                        return True
                    # If english result is meaningful and gujarati word count matches english
                    guj_words = len(gujarati_text.split())
                    eng_words = len(english_text.split())
                    if english_text and abs(guj_words - eng_words) <= 1 and eng_words > 1:
                        # Same number of words = same sentence just transliterated
                        return True
                    return False

                gu_text = results.get("gujarati", "")
                hi_text = results.get("hindi", "")
                en_text = results.get("english", "")
                # Preference order: genuine Gujarati script (and not just
                # transliterated English), then genuine Devanagari, then the
                # English result, finally whatever attempt succeeded first.
                if gu_text and has_gujarati_script(gu_text) and not is_transliterated_english(gu_text, en_text):
                    best = gu_text
                    print(f"[STT] Real Gujarati detected: {best[:60]}")
                elif hi_text and has_hindi_script(hi_text):
                    best = hi_text
                    print(f"[STT] Real Hindi detected: {best[:60]}")
                else:
                    best = en_text or gu_text or list(results.values())[0]
                    print(f"[STT] English/fallback selected: {best[:60]}")
                return best
        except Exception as e:
            print(f"[STT] Key #{index+1} failed: {e}")
            continue
    # All keys exhausted without a usable transcript.
    return ""
# ================================================================
# AI: GROQ → GEMINI FALLBACK
# ================================================================
def build_prompt(user_text: str, language: str) -> str:
return (
f"You are a helpful voice assistant for rural villagers in India.\n"
f"You help with: farming tips, crop care, weather advice, government schemes, general questions.\n"
f"STRICT RULES:\n"
f"- Detect and reply ONLY in {language}. Use correct script.\n"
f"- Reply must be SHORT — this is a PHONE CALL. Maximum 2-3 sentences.\n"
f"- NO bullet points, NO lists, NO markdown. Speak naturally.\n"
f"- Be warm, simple, and clear for a rural farmer.\n"
f"\nUser said: {user_text}\n"
f"Your spoken reply ({language}, 2-3 sentences max):"
)
def try_groq_chat(prompt: str) -> str | None:
for index, key in enumerate(GROQ_API_KEYS):
try:
client = Groq(api_key=key)
resp = client.chat.completions.create(
model=GROQ_CHAT_MODEL,
messages=[{"role": "user", "content": prompt}],
max_tokens=120,
temperature=0.4,
)
result = resp.choices[0].message.content.strip()
print(f"[AI/Groq] Key #{index+1} success")
return result
except Exception as e:
if "429" in str(e) or "rate" in str(e).lower():
print(f"[AI/Groq] Key #{index+1} rate limited, trying next...")
else:
print(f"[AI/Groq] Key #{index+1} error: {e}")
continue
return None
def try_gemini_chat(prompt: str) -> str | None:
for index, key in enumerate(GEMINI_API_KEYS):
try:
client = genai.Client(api_key=key)
resp = client.models.generate_content(model=GEMINI_MODEL, contents=prompt)
print(f"[AI/Gemini] Key #{index+1} success")
return resp.text.strip()
except Exception as e:
if "429" in str(e) or "quota" in str(e).lower():
print(f"[AI/Gemini] Key #{index+1} quota exceeded, trying next...")
else:
print(f"[AI/Gemini] Key #{index+1} error: {e}")
continue
return None
def generate_reply(user_text: str, language: str) -> str:
prompt = build_prompt(user_text, language)
result = try_groq_chat(prompt) or try_gemini_chat(prompt)
if not result:
return {
"English": "Sorry, I could not process your request. Please try again.",
"Hindi": "माफ करें, अभी जवाब देने में असमर्थ हूँ। कृपया दोबारा कोशिश करें।",
"Gujarati": "માફ કરશો, હું હમણાં જવાબ આપી શકતો નથી. ફરી પ્રયાસ કરો."
}.get(language, "Sorry, unable to respond now.")
# Clean for speech
result = re.sub(r"[*_`#]", "", result)
result = result.replace("\n", " ").strip()
return result
# ================================================================
# TTS: MICROSOFT EDGE NEURAL (free, excellent Gujarati/Hindi)
# ================================================================
async def synthesize_tts(text: str, language: str, output_path: Path) -> None:
"""Async TTS — must be awaited. Works correctly inside FastAPI/uvicorn."""
voice = EDGE_VOICES.get(language, EDGE_VOICES["English"])
print(f"[TTS] Using voice: {voice} for {language}")
communicate = edge_tts.Communicate(text, voice)
await communicate.save(str(output_path))
# ================================================================
# HELPERS
# ================================================================
def now_id() -> str:
return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
# ================================================================
# ENDPOINTS
# ================================================================
@app.get("/health")
def health():
return {
"ok": True,
"groq_keys": len(GROQ_API_KEYS),
"gemini_keys": len(GEMINI_API_KEYS),
"tts_engine": "Microsoft Edge Neural TTS",
"voices": EDGE_VOICES,
"languages": ["English", "Hindi", "Gujarati"]
}
@app.post("/calls/upload")
async def upload_call(
phone: str = Form(default=""),
device_id: str = Form(default="android"),
meta: str = Form(default="{}"),
language_hint: str = Form(default=""), # "gujarati", "hindi", "english" or ""
audio_file: UploadFile = File(...),
):
"""
React Native app sends:
- audio_file: the recorded call audio (m4a/wav/mp3)
- phone: caller's number
- device_id: your gateway device ID
- meta: any extra JSON metadata
Returns:
- transcript: what user said
- reply_text: AI response
- reply_audio_url: URL to fetch the MP3 voice response
- language: detected language
"""
call_id = now_id()
# Save uploaded audio
ext = Path(audio_file.filename or "call.m4a").suffix or ".m4a"
raw_audio_path = RECORDINGS_DIR / f"{call_id}{ext}"
raw_audio_path.write_bytes(await audio_file.read())
print(f"[{call_id}] Audio saved: {raw_audio_path}")
# Step 1: Transcribe with Groq Whisper
transcript = transcribe_audio(raw_audio_path, language_hint=language_hint or None)
print(f"[{call_id}] Transcript: {transcript}")
# Step 2: Detect language
language = detect_language(transcript) if transcript else "Hindi"
print(f"[{call_id}] Language: {language}")
# Step 3: Generate AI reply
ai_text = generate_reply(transcript or "Hello", language)
print(f"[{call_id}] AI Reply: {ai_text}")
# Step 4: Convert to speech with Edge TTS
response_mp3 = RESPONSES_DIR / f"{call_id}.mp3"
await synthesize_tts(ai_text, language, response_mp3)
# Step 5: Save log
log_item = {
"call_id": call_id,
"created_at": datetime.utcnow().isoformat() + "Z",
"phone": phone,
"device_id": device_id,
"language": language,
"meta": json.loads(meta or "{}"),
"audio_path": str(raw_audio_path),
"transcript": transcript,
"reply_text": ai_text,
"reply_audio_path": str(response_mp3),
}
(LOGS_DIR / f"{call_id}.json").write_text(
json.dumps(log_item, indent=2, ensure_ascii=False), encoding="utf-8"
)
return JSONResponse({
"call_id": call_id,
"transcript": transcript,
"reply_text": ai_text,
"reply_audio_url": f"/calls/response/{call_id}",
"language": language,
"language_hint": language_hint or "auto-detected",
"stt_model": GROQ_STT_MODEL,
})
@app.get("/calls/response/{call_id}")
def get_response(call_id: str):
"""React Native app fetches this MP3 and plays it during the call."""
mp3 = RESPONSES_DIR / f"{call_id}.mp3"
if not mp3.exists():
return JSONResponse({"error": "not_found"}, status_code=404)
return FileResponse(mp3, media_type="audio/mpeg", filename=f"{call_id}.mp3")
@app.get("/calls/logs")
def get_logs(limit: int = 20):
"""View recent call logs."""
logs = sorted(LOGS_DIR.glob("*.json"), reverse=True)[:limit]
return [json.loads(f.read_text(encoding="utf-8")) for f in logs]
@app.get("/calls/logs/{call_id}")
def get_log(call_id: str):
"""View log for a specific call."""
log_file = LOGS_DIR / f"{call_id}.json"
if not log_file.exists():
return JSONResponse({"error": "not_found"}, status_code=404)
return json.loads(log_file.read_text(encoding="utf-8"))