import asyncio
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path

import edge_tts
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse, FileResponse
from google import genai
from groq import Groq
|
|
app = FastAPI(title="Voice Call AI Bridge Backend")


# On-disk layout lives next to this file: data/{recordings,responses,logs}.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
RECORDINGS_DIR = DATA_DIR / "recordings"   # raw uploaded call audio
RESPONSES_DIR = DATA_DIR / "responses"     # synthesized MP3 replies
LOGS_DIR = DATA_DIR / "logs"               # one JSON log per call


# Create the tree up front so request handlers can write without checks.
for directory in (RECORDINGS_DIR, RESPONSES_DIR, LOGS_DIR):
    directory.mkdir(parents=True, exist_ok=True)
|
|
| |
| |
| |
| GROQ_API_KEYS = [] |
| i = 1 |
| while True: |
| key = os.getenv(f"GROQ_API_KEY_{i}") |
| if not key: |
| |
| key = os.getenv("GROQ_API_KEY") if i == 1 else None |
| if not key: |
| break |
| GROQ_API_KEYS.append(key) |
| i += 1 |
|
|
| GEMINI_API_KEYS = [] |
| i = 1 |
| while True: |
| key = os.getenv(f"GEMINI_API_KEY_{i}") |
| if not key: |
| break |
| GEMINI_API_KEYS.append(key) |
| i += 1 |
|
|
| GROQ_CHAT_MODEL = os.getenv("GROQ_CHAT_MODEL", "llama-3.3-70b-versatile") |
| GROQ_STT_MODEL = "whisper-large-v3" |
| GEMINI_MODEL = "gemini-2.0-flash" |
|
|
| |
# ISO 639-1 codes Whisper expects for each supported spoken language.
WHISPER_LANG_CODES = {
    "gujarati": "gu",
    "hindi": "hi",
    "english": "en",
}


# Startup diagnostic: how many rotating API keys were found in the environment.
print(f"Loaded {len(GROQ_API_KEYS)} Groq key(s), {len(GEMINI_API_KEYS)} Gemini key(s).")


# Microsoft Edge neural voices (Indian locales) used for TTS, keyed by the
# display-language names produced by detect_language().
EDGE_VOICES = {
    "English": "en-IN-NeerjaNeural",
    "Hindi": "hi-IN-SwaraNeural",
    "Gujarati": "gu-IN-DhwaniNeural",
}
|
|
| |
| |
| |
def detect_language(text: str) -> str:
    """Classify text by the first Indic code point found; defaults to English."""
    for ch in text:
        code_point = ord(ch)
        if 0x0A80 <= code_point <= 0x0AFF:   # Gujarati Unicode block
            return "Gujarati"
        if 0x0900 <= code_point <= 0x097F:   # Devanagari Unicode block
            return "Hindi"
    return "English"
|
|
| |
| |
| |
| def transcribe_audio(path: Path, language_hint: str = None) -> str: |
| """ |
| Transcribe audio with Whisper. |
| language_hint: "gujarati", "hindi", or "english" — dramatically improves accuracy. |
| If not provided, tries Gujarati + Hindi + auto and picks the best result. |
| """ |
| if not GROQ_API_KEYS: |
| return "(transcription skipped: no GROQ_API_KEY configured)" |
|
|
| def _transcribe_with_lang(key: str, lang_code: str = None) -> str: |
| client = Groq(api_key=key) |
| with path.open("rb") as audio: |
| kwargs = dict( |
| model=GROQ_STT_MODEL, |
| file=audio, |
| response_format="verbose_json", |
| ) |
| if lang_code: |
| kwargs["language"] = lang_code |
| result = client.audio.transcriptions.create(**kwargs) |
| text = getattr(result, "text", None) or (result.get("text", "") if isinstance(result, dict) else "") |
| return text.strip() |
|
|
| for index, key in enumerate(GROQ_API_KEYS): |
| try: |
| if language_hint: |
| |
| lang_code = WHISPER_LANG_CODES.get(language_hint.lower()) |
| text = _transcribe_with_lang(key, lang_code) |
| print(f"[STT] Key #{index+1} ({language_hint}): {text[:60]}") |
| return text |
|
|
| else: |
| |
| results = {} |
| for lang_name, lang_code in WHISPER_LANG_CODES.items(): |
| try: |
| t = _transcribe_with_lang(key, lang_code) |
| if t and t not in [".", "", " "]: |
| results[lang_name] = t |
| print(f"[STT] {lang_name} attempt: {t[:50]}") |
| except Exception: |
| pass |
|
|
| if not results: |
| continue |
|
|
| def has_gujarati_script(t): |
| return any('' <= c <= '૿' for c in t) |
|
|
| def has_hindi_script(t): |
| return any('ऀ' <= c <= 'ॿ' for c in t) |
|
|
| def is_transliterated_english(gujarati_text, english_text): |
| """ |
| Detect if Whisper just wrote English words in Gujarati script. |
| Strategy: count how many English words appear phonetically in Gujarati text. |
| Common English loanwords in Gujarati script are a giveaway. |
| Also: if English result has meaningful words and Gujarati has same word count, |
| it's likely transliteration. |
| """ |
| |
| english_in_gujarati_markers = [ |
| 'એન', 'ધ', 'ઈન', 'ઈઝ', 'ઓફ', 'ટો', 'એ', 'કેન', 'યુ', |
| 'આઈ', 'વી', 'ઓહ', 'હાઈ', 'ઓકે', 'યસ', 'નો', 'હેલો', |
| 'ટોક', 'સ્પીક', 'ઈટ', 'માય', 'યોર', 'ઈઝ' |
| ] |
| marker_count = sum(1 for m in english_in_gujarati_markers if m in gujarati_text) |
| |
| if marker_count >= 2: |
| return True |
| |
| guj_words = len(gujarati_text.split()) |
| eng_words = len(english_text.split()) |
| if english_text and abs(guj_words - eng_words) <= 1 and eng_words > 1: |
| |
| return True |
| return False |
|
|
| gu_text = results.get("gujarati", "") |
| hi_text = results.get("hindi", "") |
| en_text = results.get("english", "") |
|
|
| if gu_text and has_gujarati_script(gu_text) and not is_transliterated_english(gu_text, en_text): |
| best = gu_text |
| print(f"[STT] Real Gujarati detected: {best[:60]}") |
| elif hi_text and has_hindi_script(hi_text): |
| best = hi_text |
| print(f"[STT] Real Hindi detected: {best[:60]}") |
| else: |
| best = en_text or gu_text or list(results.values())[0] |
| print(f"[STT] English/fallback selected: {best[:60]}") |
|
|
| return best |
|
|
| except Exception as e: |
| print(f"[STT] Key #{index+1} failed: {e}") |
| continue |
|
|
| return "" |
|
|
| |
| |
| |
def build_prompt(user_text: str, language: str) -> str:
    """Compose the LLM prompt for a short, phone-friendly reply in *language*."""
    prompt_lines = [
        "You are a helpful voice assistant for rural villagers in India.",
        "You help with: farming tips, crop care, weather advice, government schemes, general questions.",
        "STRICT RULES:",
        f"- Detect and reply ONLY in {language}. Use correct script.",
        "- Reply must be SHORT — this is a PHONE CALL. Maximum 2-3 sentences.",
        "- NO bullet points, NO lists, NO markdown. Speak naturally.",
        "- Be warm, simple, and clear for a rural farmer.",
        "",
        f"User said: {user_text}",
        f"Your spoken reply ({language}, 2-3 sentences max):",
    ]
    return "\n".join(prompt_lines)
|
|
def try_groq_chat(prompt: str) -> str | None:
    """Ask Groq for a reply, rotating through keys; None when every key fails."""
    for n, api_key in enumerate(GROQ_API_KEYS, start=1):
        try:
            client = Groq(api_key=api_key)
            completion = client.chat.completions.create(
                model=GROQ_CHAT_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=120,
                temperature=0.4,
            )
            answer = completion.choices[0].message.content.strip()
            print(f"[AI/Groq] Key #{n} success")
            return answer
        except Exception as exc:
            detail = str(exc)
            if "429" in detail or "rate" in detail.lower():
                print(f"[AI/Groq] Key #{n} rate limited, trying next...")
            else:
                print(f"[AI/Groq] Key #{n} error: {exc}")
            continue
    return None
|
|
def try_gemini_chat(prompt: str) -> str | None:
    """Ask Gemini for a reply, rotating through keys; None when every key fails."""
    for n, api_key in enumerate(GEMINI_API_KEYS, start=1):
        try:
            client = genai.Client(api_key=api_key)
            response = client.models.generate_content(model=GEMINI_MODEL, contents=prompt)
            print(f"[AI/Gemini] Key #{n} success")
            return response.text.strip()
        except Exception as exc:
            detail = str(exc)
            if "429" in detail or "quota" in detail.lower():
                print(f"[AI/Gemini] Key #{n} quota exceeded, trying next...")
            else:
                print(f"[AI/Gemini] Key #{n} error: {exc}")
            continue
    return None
|
|
def generate_reply(user_text: str, language: str) -> str:
    """Produce a short spoken reply: Groq first, Gemini next, canned text last."""
    prompt = build_prompt(user_text, language)

    reply = try_groq_chat(prompt)
    if not reply:
        reply = try_gemini_chat(prompt)

    if not reply:
        # Both providers failed — return a fixed apology in the caller's language.
        fallbacks = {
            "English": "Sorry, I could not process your request. Please try again.",
            "Hindi": "माफ करें, अभी जवाब देने में असमर्थ हूँ। कृपया दोबारा कोशिश करें।",
            "Gujarati": "માફ કરશો, હું હમણાં જવાબ આપી શકતો નથી. ફરી પ્રયાસ કરો.",
        }
        return fallbacks.get(language, "Sorry, unable to respond now.")

    # Strip markdown punctuation and newlines so TTS reads plain sentences.
    cleaned = re.sub(r"[*_`#]", "", reply)
    return cleaned.replace("\n", " ").strip()
|
|
| |
| |
| |
async def synthesize_tts(text: str, language: str, output_path: Path) -> None:
    """Async TTS — must be awaited. Works correctly inside FastAPI/uvicorn."""
    selected_voice = EDGE_VOICES.get(language) or EDGE_VOICES["English"]
    print(f"[TTS] Using voice: {selected_voice} for {language}")
    await edge_tts.Communicate(text, selected_voice).save(str(output_path))
|
|
| |
| |
| |
def now_id() -> str:
    """
    Return a UTC timestamp id, e.g. "20240101_120000_123456".

    Microsecond precision keeps ids unique per call. Uses the timezone-aware
    datetime.now(timezone.utc) instead of the deprecated datetime.utcnow();
    the formatted string is identical.
    """
    return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
|
|
| |
| |
| |
@app.get("/health")
def health():
    """Liveness/config probe: key counts, TTS engine, and supported languages."""
    return dict(
        ok=True,
        groq_keys=len(GROQ_API_KEYS),
        gemini_keys=len(GEMINI_API_KEYS),
        tts_engine="Microsoft Edge Neural TTS",
        voices=EDGE_VOICES,
        languages=["English", "Hindi", "Gujarati"],
    )
|
|
|
|
@app.post("/calls/upload")
async def upload_call(
    phone: str = Form(default=""),
    device_id: str = Form(default="android"),
    meta: str = Form(default="{}"),
    language_hint: str = Form(default=""),
    audio_file: UploadFile = File(...),
):
    """
    Main pipeline endpoint: audio in -> transcript -> AI reply -> MP3 out.

    React Native app sends:
        - audio_file: the recorded call audio (m4a/wav/mp3)
        - phone: caller's number
        - device_id: your gateway device ID
        - meta: extra JSON metadata (malformed JSON is tolerated, not a 500)
        - language_hint: optional "gujarati"/"hindi"/"english" to speed up STT

    Returns:
        - transcript: what user said
        - reply_text: AI response
        - reply_audio_url: URL to fetch the MP3 voice response
        - language: detected language
    """
    call_id = now_id()

    # 1. Persist the uploaded audio; keep the client's extension if it sent one.
    ext = Path(audio_file.filename or "call.m4a").suffix or ".m4a"
    raw_audio_path = RECORDINGS_DIR / f"{call_id}{ext}"
    raw_audio_path.write_bytes(await audio_file.read())
    print(f"[{call_id}] Audio saved: {raw_audio_path}")

    # 2. Speech-to-text (a hint, when given, skips the multi-language probe).
    transcript = transcribe_audio(raw_audio_path, language_hint=language_hint or None)
    print(f"[{call_id}] Transcript: {transcript}")

    # 3. Pick the reply language from the transcript script; default to Hindi.
    language = detect_language(transcript) if transcript else "Hindi"
    print(f"[{call_id}] Language: {language}")

    # 4. LLM reply (Groq first, Gemini fallback, canned message last).
    ai_text = generate_reply(transcript or "Hello", language)
    print(f"[{call_id}] AI Reply: {ai_text}")

    # 5. Text-to-speech into an MP3 the app streams back into the call.
    response_mp3 = RESPONSES_DIR / f"{call_id}.mp3"
    await synthesize_tts(ai_text, language, response_mp3)

    # 6. Write the per-call log. FIX: malformed client metadata previously
    #    raised inside json.loads and failed the whole request with a 500;
    #    now it is recorded as-is and the call proceeds.
    try:
        meta_obj = json.loads(meta or "{}")
    except (json.JSONDecodeError, TypeError):
        meta_obj = {"_raw": meta, "_error": "invalid JSON"}

    log_item = {
        "call_id": call_id,
        # Timezone-aware UTC (utcnow is deprecated); keep the trailing-"Z" format.
        "created_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "phone": phone,
        "device_id": device_id,
        "language": language,
        "meta": meta_obj,
        "audio_path": str(raw_audio_path),
        "transcript": transcript,
        "reply_text": ai_text,
        "reply_audio_path": str(response_mp3),
    }
    (LOGS_DIR / f"{call_id}.json").write_text(
        json.dumps(log_item, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    return JSONResponse({
        "call_id": call_id,
        "transcript": transcript,
        "reply_text": ai_text,
        "reply_audio_url": f"/calls/response/{call_id}",
        "language": language,
        "language_hint": language_hint or "auto-detected",
        "stt_model": GROQ_STT_MODEL,
    })
|
|
|
|
@app.get("/calls/response/{call_id}")
def get_response(call_id: str):
    """
    Serve the synthesized MP3 reply for *call_id* (the app plays it in-call).

    call_id is untrusted input used to build a filesystem path, so it is
    restricted to the characters now_id() produces; anything else (dots,
    slashes, encoded traversal sequences) is rejected as not found.
    """
    if not re.fullmatch(r"[A-Za-z0-9_-]+", call_id):
        return JSONResponse({"error": "not_found"}, status_code=404)
    mp3 = RESPONSES_DIR / f"{call_id}.mp3"
    if not mp3.exists():
        return JSONResponse({"error": "not_found"}, status_code=404)
    return FileResponse(mp3, media_type="audio/mpeg", filename=f"{call_id}.mp3")
|
|
|
|
@app.get("/calls/logs")
def get_logs(limit: int = 20):
    """Return the *limit* most recent call logs (filenames sort by timestamp)."""
    newest_first = sorted(LOGS_DIR.glob("*.json"), reverse=True)
    return [
        json.loads(log_file.read_text(encoding="utf-8"))
        for log_file in newest_first[:limit]
    ]
|
|
|
|
@app.get("/calls/logs/{call_id}")
def get_log(call_id: str):
    """
    View the JSON log for a specific call.

    call_id is untrusted input used to build a filesystem path, so it is
    restricted to the characters now_id() produces; anything else (dots,
    slashes, encoded traversal sequences) is rejected as not found.
    """
    if not re.fullmatch(r"[A-Za-z0-9_-]+", call_id):
        return JSONResponse({"error": "not_found"}, status_code=404)
    log_file = LOGS_DIR / f"{call_id}.json"
    if not log_file.exists():
        return JSONResponse({"error": "not_found"}, status_code=404)
    return json.loads(log_file.read_text(encoding="utf-8"))