import os
import json
import asyncio
import re
from pathlib import Path
from datetime import datetime
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse, FileResponse
import edge_tts
from groq import Groq
from google import genai

app = FastAPI(title="Voice Call AI Bridge Backend")

# All artifacts live under ./data next to this file:
#   recordings/ — raw uploaded call audio
#   responses/  — synthesized MP3 replies
#   logs/       — one JSON document per call
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
RECORDINGS_DIR = DATA_DIR / "recordings"
RESPONSES_DIR = DATA_DIR / "responses"
LOGS_DIR = DATA_DIR / "logs"
for d in (RECORDINGS_DIR, RESPONSES_DIR, LOGS_DIR):
    d.mkdir(parents=True, exist_ok=True)

# ================================================================
# DYNAMIC API KEYS (unlimited)
# ================================================================
# Keys are read from numbered env vars (GROQ_API_KEY_1, GROQ_API_KEY_2, ...)
# until the first gap; callers rotate through the list on failure.
GROQ_API_KEYS = []
i = 1
while True:
    key = os.getenv(f"GROQ_API_KEY_{i}")
    if not key:
        # also try plain GROQ_API_KEY for backward compat
        key = os.getenv("GROQ_API_KEY") if i == 1 else None
    if not key:
        break
    GROQ_API_KEYS.append(key)
    i += 1

GEMINI_API_KEYS = []
i = 1
while True:
    key = os.getenv(f"GEMINI_API_KEY_{i}")
    if not key:
        break
    GEMINI_API_KEYS.append(key)
    i += 1

GROQ_CHAT_MODEL = os.getenv("GROQ_CHAT_MODEL", "llama-3.3-70b-versatile")
GROQ_STT_MODEL = "whisper-large-v3"  # large-v3 has better Indian language support than turbo
GEMINI_MODEL = "gemini-2.0-flash"

# Whisper language codes — telling Whisper the language upfront fixes accuracy dramatically
WHISPER_LANG_CODES = {
    "gujarati": "gu",
    "hindi": "hi",
    "english": "en",
}
# App sends language hint from user's phone locale/preference
# Fallback: transcribe twice (once with gu, once auto) and pick longer result

print(f"Loaded {len(GROQ_API_KEYS)} Groq key(s), {len(GEMINI_API_KEYS)} Gemini key(s).")

# ================================================================
# MICROSOFT EDGE NEURAL VOICES (free, no API key, excellent quality)
# ================================================================
EDGE_VOICES = {
    "English": "en-IN-NeerjaNeural",   # Indian English, natural
    "Hindi": "hi-IN-SwaraNeural",      # Hindi female, very smooth
    "Gujarati": "gu-IN-DhwaniNeural",  # Gujarati female, native quality
}

# ================================================================
# LANGUAGE DETECTION
# ================================================================
def detect_language(text: str) -> str:
    """Detect the reply language from the script of *text*.

    Checks each character against the Gujarati (U+0A80–U+0AFF) and
    Devanagari (U+0900–U+097F) Unicode ranges; the first match wins.
    Anything else (including empty text) falls back to "English".
    """
    for char in text:
        if '\u0A80' <= char <= '\u0AFF':
            return "Gujarati"
        if '\u0900' <= char <= '\u097F':
            return "Hindi"
    return "English"

# ================================================================
# STT: GROQ WHISPER (tries all keys)
# ================================================================
def transcribe_audio(path: Path, language_hint: str | None = None) -> str:
    """
    Transcribe audio with Whisper.
    language_hint: "gujarati", "hindi", or "english" — dramatically improves accuracy.
    If not provided, tries Gujarati + Hindi + auto and picks the best result.

    Returns the transcript string, or "" if every configured Groq key failed.
    Rotates through GROQ_API_KEYS on per-key failure.
    """
    if not GROQ_API_KEYS:
        return "(transcription skipped: no GROQ_API_KEY configured)"

    def _transcribe_with_lang(key: str, lang_code: str | None = None) -> str:
        # One Whisper call with an optional explicit language code.
        client = Groq(api_key=key)
        with path.open("rb") as audio:
            kwargs = dict(
                model=GROQ_STT_MODEL,
                file=audio,
                response_format="verbose_json",
            )
            if lang_code:
                kwargs["language"] = lang_code  # explicit language = much better accuracy
            result = client.audio.transcriptions.create(**kwargs)
        # SDK may return an object with .text or a plain dict — handle both.
        text = getattr(result, "text", None) or (result.get("text", "") if isinstance(result, dict) else "")
        return text.strip()

    for index, key in enumerate(GROQ_API_KEYS):
        try:
            if language_hint:
                # User told us the language — use it directly
                lang_code = WHISPER_LANG_CODES.get(language_hint.lower())
                text = _transcribe_with_lang(key, lang_code)
                print(f"[STT] Key #{index+1} ({language_hint}): {text[:60]}")
                return text
            else:
                # No hint — try Gujarati, Hindi, and auto; pick longest meaningful result
                results = {}
                for lang_name, lang_code in WHISPER_LANG_CODES.items():
                    try:
                        t = _transcribe_with_lang(key, lang_code)
                        if t and t not in [".", "", " "]:
                            results[lang_name] = t
                            print(f"[STT] {lang_name} attempt: {t[:50]}")
                    except Exception:
                        # Best-effort: a failed per-language attempt just drops out
                        # of the candidate set; remaining attempts continue.
                        pass
                if not results:
                    continue

                def has_gujarati_script(t):
                    # Any character in the Gujarati Unicode block U+0A80–U+0AFF.
                    return any('઀' <= c <= '૿' for c in t)

                def has_hindi_script(t):
                    # Any character in the Devanagari Unicode block U+0900–U+097F.
                    return any('ऀ' <= c <= 'ॿ' for c in t)

                def is_transliterated_english(gujarati_text, english_text):
                    """
                    Detect if Whisper just wrote English words in Gujarati script.
                    Strategy: count how many English words appear phonetically in Gujarati text.
                    Common English loanwords in Gujarati script are a giveaway.
                    Also: if English result has meaningful words and Gujarati has same word count,
                    it's likely transliteration.
                    """
                    # Common English words that Whisper writes in Gujarati script when confused
                    english_in_gujarati_markers = [
                        'એન', 'ધ', 'ઈન', 'ઈઝ', 'ઓફ', 'ટો', 'એ', 'કેન', 'યુ', 'આઈ',
                        'વી', 'ઓહ', 'હાઈ', 'ઓકે', 'યસ', 'નો', 'હેલો', 'ટોક', 'સ્પીક',
                        'ઈટ', 'માય', 'યોર', 'ઈઝ'
                    ]
                    marker_count = sum(1 for m in english_in_gujarati_markers if m in gujarati_text)
                    # If 2+ English markers found in Gujarati text, it's transliteration
                    if marker_count >= 2:
                        return True
                    # If english result is meaningful and gujarati word count matches english
                    guj_words = len(gujarati_text.split())
                    eng_words = len(english_text.split())
                    if english_text and abs(guj_words - eng_words) <= 1 and eng_words > 1:
                        # Same number of words = same sentence just transliterated
                        return True
                    return False

                gu_text = results.get("gujarati", "")
                hi_text = results.get("hindi", "")
                en_text = results.get("english", "")
                # Preference order: genuine Gujarati > genuine Hindi > English/fallback.
                if gu_text and has_gujarati_script(gu_text) and not is_transliterated_english(gu_text, en_text):
                    best = gu_text
                    print(f"[STT] Real Gujarati detected: {best[:60]}")
                elif hi_text and has_hindi_script(hi_text):
                    best = hi_text
                    print(f"[STT] Real Hindi detected: {best[:60]}")
                else:
                    best = en_text or gu_text or list(results.values())[0]
                    print(f"[STT] English/fallback selected: {best[:60]}")
                return best
        except Exception as e:
            # Key-level failure (auth, quota, network) — rotate to the next key.
            print(f"[STT] Key #{index+1} failed: {e}")
            continue
    return ""
# ================================================================
# AI: GROQ → GEMINI FALLBACK
# ================================================================
def build_prompt(user_text: str, language: str) -> str:
    """Build a chat prompt that forces a short, spoken-style reply in *language*.

    The constraints (2-3 sentences, no markdown/lists) exist because the
    reply is synthesized to speech and played back during a phone call.
    """
    return (
        f"You are a helpful voice assistant for rural villagers in India.\n"
        f"You help with: farming tips, crop care, weather advice, government schemes, general questions.\n"
        f"STRICT RULES:\n"
        f"- Detect and reply ONLY in {language}. Use correct script.\n"
        f"- Reply must be SHORT — this is a PHONE CALL. Maximum 2-3 sentences.\n"
        f"- NO bullet points, NO lists, NO markdown. Speak naturally.\n"
        f"- Be warm, simple, and clear for a rural farmer.\n"
        f"\nUser said: {user_text}\n"
        f"Your spoken reply ({language}, 2-3 sentences max):"
    )


def try_groq_chat(prompt: str) -> str | None:
    """Request a completion from Groq, rotating through all configured keys.

    Returns the reply text on first success, or None if every key fails
    (rate limit or other error). Errors are logged, never raised.
    """
    for index, key in enumerate(GROQ_API_KEYS):
        try:
            client = Groq(api_key=key)
            resp = client.chat.completions.create(
                model=GROQ_CHAT_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=120,   # keeps replies phone-call short
                temperature=0.4,
            )
            result = resp.choices[0].message.content.strip()
            print(f"[AI/Groq] Key #{index+1} success")
            return result
        except Exception as e:
            if "429" in str(e) or "rate" in str(e).lower():
                print(f"[AI/Groq] Key #{index+1} rate limited, trying next...")
            else:
                print(f"[AI/Groq] Key #{index+1} error: {e}")
            continue
    return None


def try_gemini_chat(prompt: str) -> str | None:
    """Request a completion from Gemini, rotating through all configured keys.

    Used as the fallback when every Groq key is exhausted.
    Returns the reply text, or None if every key fails.
    """
    for index, key in enumerate(GEMINI_API_KEYS):
        try:
            client = genai.Client(api_key=key)
            resp = client.models.generate_content(model=GEMINI_MODEL, contents=prompt)
            print(f"[AI/Gemini] Key #{index+1} success")
            return resp.text.strip()
        except Exception as e:
            if "429" in str(e) or "quota" in str(e).lower():
                print(f"[AI/Gemini] Key #{index+1} quota exceeded, trying next...")
            else:
                print(f"[AI/Gemini] Key #{index+1} error: {e}")
            continue
    return None


def generate_reply(user_text: str, language: str) -> str:
    """Generate a short spoken reply: Groq first, Gemini fallback, canned apology last.

    The result is stripped of markdown characters and newlines so it reads
    naturally when fed to TTS.
    """
    prompt = build_prompt(user_text, language)
    result = try_groq_chat(prompt) or try_gemini_chat(prompt)
    if not result:
        # Both providers failed — apologize in the caller's language.
        return {
            "English": "Sorry, I could not process your request. Please try again.",
            "Hindi": "माफ करें, अभी जवाब देने में असमर्थ हूँ। कृपया दोबारा कोशिश करें।",
            "Gujarati": "માફ કરશો, હું હમણાં જવાબ આપી શકતો નથી. ફરી પ્રયાસ કરો."
        }.get(language, "Sorry, unable to respond now.")
    # Clean for speech
    result = re.sub(r"[*_`#]", "", result)
    result = result.replace("\n", " ").strip()
    return result


# ================================================================
# TTS: MICROSOFT EDGE NEURAL (free, excellent Gujarati/Hindi)
# ================================================================
async def synthesize_tts(text: str, language: str, output_path: Path) -> None:
    """Async TTS — must be awaited. Works correctly inside FastAPI/uvicorn."""
    # Unknown languages fall back to the Indian-English voice.
    voice = EDGE_VOICES.get(language, EDGE_VOICES["English"])
    print(f"[TTS] Using voice: {voice} for {language}")
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(str(output_path))


# ================================================================
# HELPERS
# ================================================================
def now_id() -> str:
    """Return a UTC timestamp call id: YYYYMMDD_HHMMSS_ffffff (microseconds)."""
    # NOTE(review): datetime.utcnow() is deprecated in 3.12; keeping it here to
    # avoid changing the id format — revisit with datetime.now(timezone.utc).
    return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")


# call_id values are always produced by now_id(); reject anything else coming
# from a URL so it cannot be used to reach arbitrary files on disk.
_CALL_ID_RE = re.compile(r"^\d{8}_\d{6}_\d{6}$")


# ================================================================
# ENDPOINTS
# ================================================================
@app.get("/health")
def health():
    """Liveness/config probe: key counts, TTS engine, supported languages."""
    return {
        "ok": True,
        "groq_keys": len(GROQ_API_KEYS),
        "gemini_keys": len(GEMINI_API_KEYS),
        "tts_engine": "Microsoft Edge Neural TTS",
        "voices": EDGE_VOICES,
        "languages": ["English", "Hindi", "Gujarati"]
    }


@app.post("/calls/upload")
async def upload_call(
    phone: str = Form(default=""),
    device_id: str = Form(default="android"),
    meta: str = Form(default="{}"),
    language_hint: str = Form(default=""),  # "gujarati", "hindi", "english" or ""
    audio_file: UploadFile = File(...),
):
    """
    React Native app sends:
    - audio_file: the recorded call audio (m4a/wav/mp3)
    - phone: caller's number
    - device_id: your gateway device ID
    - meta: any extra JSON metadata
    Returns:
    - transcript: what user said
    - reply_text: AI response
    - reply_audio_url: URL to fetch the MP3 voice response
    - language: detected language
    """
    call_id = now_id()

    # Parse metadata up front so malformed client JSON fails cheaply here,
    # instead of 500-ing after the expensive STT -> AI -> TTS pipeline.
    try:
        meta_obj = json.loads(meta or "{}")
    except json.JSONDecodeError:
        meta_obj = {"raw": meta}

    # Save uploaded audio (keep the client's extension when present)
    ext = Path(audio_file.filename or "call.m4a").suffix or ".m4a"
    raw_audio_path = RECORDINGS_DIR / f"{call_id}{ext}"
    raw_audio_path.write_bytes(await audio_file.read())
    print(f"[{call_id}] Audio saved: {raw_audio_path}")

    # Step 1: Transcribe with Groq Whisper
    transcript = transcribe_audio(raw_audio_path, language_hint=language_hint or None)
    print(f"[{call_id}] Transcript: {transcript}")

    # Step 2: Detect language (empty transcript defaults to Hindi)
    language = detect_language(transcript) if transcript else "Hindi"
    print(f"[{call_id}] Language: {language}")

    # Step 3: Generate AI reply
    ai_text = generate_reply(transcript or "Hello", language)
    print(f"[{call_id}] AI Reply: {ai_text}")

    # Step 4: Convert to speech with Edge TTS
    response_mp3 = RESPONSES_DIR / f"{call_id}.mp3"
    await synthesize_tts(ai_text, language, response_mp3)

    # Step 5: Save log
    log_item = {
        "call_id": call_id,
        "created_at": datetime.utcnow().isoformat() + "Z",
        "phone": phone,
        "device_id": device_id,
        "language": language,
        "meta": meta_obj,
        "audio_path": str(raw_audio_path),
        "transcript": transcript,
        "reply_text": ai_text,
        "reply_audio_path": str(response_mp3),
    }
    (LOGS_DIR / f"{call_id}.json").write_text(
        json.dumps(log_item, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    return JSONResponse({
        "call_id": call_id,
        "transcript": transcript,
        "reply_text": ai_text,
        "reply_audio_url": f"/calls/response/{call_id}",
        "language": language,
        "language_hint": language_hint or "auto-detected",
        "stt_model": GROQ_STT_MODEL,
    })


@app.get("/calls/response/{call_id}")
def get_response(call_id: str):
    """React Native app fetches this MP3 and plays it during the call."""
    # Reject ids that don't match now_id()'s format — prevents path tricks.
    if not _CALL_ID_RE.fullmatch(call_id):
        return JSONResponse({"error": "not_found"}, status_code=404)
    mp3 = RESPONSES_DIR / f"{call_id}.mp3"
    if not mp3.exists():
        return JSONResponse({"error": "not_found"}, status_code=404)
    return FileResponse(mp3, media_type="audio/mpeg", filename=f"{call_id}.mp3")


@app.get("/calls/logs")
def get_logs(limit: int = 20):
    """View recent call logs (newest first); corrupt log files are skipped."""
    items = []
    for log_file in sorted(LOGS_DIR.glob("*.json"), reverse=True)[:limit]:
        try:
            items.append(json.loads(log_file.read_text(encoding="utf-8")))
        except (json.JSONDecodeError, OSError):
            continue  # one bad file should not break the whole listing
    return items


@app.get("/calls/logs/{call_id}")
def get_log(call_id: str):
    """View log for a specific call."""
    # Same id validation as get_response — never build paths from raw input.
    if not _CALL_ID_RE.fullmatch(call_id):
        return JSONResponse({"error": "not_found"}, status_code=404)
    log_file = LOGS_DIR / f"{call_id}.json"
    if not log_file.exists():
        return JSONResponse({"error": "not_found"}, status_code=404)
    return json.loads(log_file.read_text(encoding="utf-8"))