"""FastAPI service that converts a PDF into a two-speaker MP3 "podcast".

Pipeline: extract text from an uploaded PDF (pdfplumber) -> ask Gemini to
write an Emily/Bob dialogue as JSON -> synthesise each line with gTTS
(Bob's voice is pitch-shifted down via pydub) -> stream the stitched MP3
back to the client.
"""

from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import google.generativeai as genai
import pdfplumber
import json
import re
import os
import tempfile
import shutil
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO

app = FastAPI()

# CORS: wide open so any frontend origin can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.on_event("startup")
def startup_event():
    """Configure the Gemini client from the GOOGLE_API_KEY env var.

    Missing keys only produce a warning here; requests will fail later
    when `generate_conversation` actually calls the model.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if api_key:
        genai.configure(api_key=api_key)
    else:
        print("Warning: GOOGLE_API_KEY not found")


def extract_text_from_pdf(file_bytes):
    """Return the concatenated text of every page in the PDF.

    Args:
        file_bytes: a binary file-like object (e.g. BytesIO) holding the PDF.

    Returns:
        All extracted page text joined with newlines; pages with no
        extractable text (scanned images, etc.) are skipped.
    """
    text = ""
    with pdfplumber.open(file_bytes) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # extract_text() returns None for image-only pages
                text += page_text + "\n"
    return text


def generate_conversation(pdf_text):
    """Ask Gemini to turn `pdf_text` into an Emily/Bob dialogue.

    Args:
        pdf_text: the raw text extracted from the uploaded PDF.

    Returns:
        The parsed JSON conversation: a list of single-key dicts like
        [{"Emily": "..."}, {"Bob": "..."}].

    Raises:
        ValueError: if the model's response cannot be parsed as JSON
        even after cleanup.
    """
    model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
    output_format = """
    [
    {"Emily": "..."},
    {"Bob": "..."}
    ]
    """
    query = f"""
    You are the expert conversation generator for the JEE student based on provided inputs.Your task is to generate the incentive conversation between Emily and her friend Bob explaining ALL the concepts to each others in *DETAILS*.
    The content to use to generate the conversations: {pdf_text}
    -----------------------------------------------------------------------
    **NOTE**:
    - Do not include ```json anywere.
    - All points in the givent content should be explained with details in output conversation.
    - **Some dialog should contain filler words only**.Do not limit the conversation.
    - The conversation should inlcudes filler words such as umm, yahh,etc. at proper places specially for Emily.
    - The conversation will be read by tts so make it very easy and accurate to read.
    - The formulas should be accuratly read by tts.
    - It should include pauses, emphasizes, and similar emotions.
    - All the topics in the given content should be coverd with bettere and detailed explanations in the output disscusion.
    - Make conversation with significant length so that all the concepts should be covered without fail.
    - The listner should understand the concepts in the given content easily by listening to the conversation between Bob and Emily.
    - The conversation should be filled with pleasure , emotions, and all.
    - All contents given to you should be completly explained to listner by hering the convesations.
    The output format should strictly follow this output format: {output_format}
    Strictly follow the provided output format and do *not* include extra intro or '''dot heading.
    OutPut Format Rules :
    Rules:
    1. **Ensure the JSON is syntactically correct** before responding.
    2. Do not include markdown (```json).
    3. Verify there are no extra commas, missing brackets, or incorrect types.
    4. Respond **only with the JSON** (no explanations)
    """
    response = model.generate_content(query)

    # Defensive cleanup: the model sometimes wraps output in ``` fences,
    # prefixes it with "json", or leaves trailing commas before ] / }.
    cleaned_text = response.text.strip("```").strip()
    cleaned_text = re.sub(r"^json", "", cleaned_text, flags=re.IGNORECASE).strip()
    cleaned_text = re.sub(r",\s*([\]}])", r"\1", cleaned_text)

    try:
        return json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        print(f"JSON Error: {e}")
        raise ValueError("Failed to parse generated conversation")


def create_audio_stream(conversation):
    """Render the conversation to a single MP3 held in memory.

    Args:
        conversation: list of {speaker: line} dicts from
        `generate_conversation`.

    Returns:
        A BytesIO positioned at 0 containing the concatenated MP3 audio,
        with 1s of leading silence and 500ms of silence after each line.
    """

    def generate_female_voice(text):
        # Plain gTTS voice is used as-is for Emily.
        tts = gTTS(text=text, lang='en')
        buf = BytesIO()
        tts.write_to_fp(buf)
        buf.seek(0)
        return AudioSegment.from_file(buf, format="mp3")

    def generate_male_voice(text):
        # gTTS has a single voice, so Bob's voice is faked by lowering
        # the pitch: reinterpret the samples at 85% frame rate, then
        # resample back so playback speed stays correct.
        tts = gTTS(text=text, lang='en')
        buf = BytesIO()
        tts.write_to_fp(buf)
        buf.seek(0)
        sound = AudioSegment.from_file(buf, format="mp3")
        lower_pitch = sound._spawn(sound.raw_data, overrides={
            "frame_rate": int(sound.frame_rate * 0.85)
        }).set_frame_rate(sound.frame_rate)
        return lower_pitch

    speaker_voice_map = {"Emily": "female", "Bob": "male"}
    final_audio = AudioSegment.silent(duration=1000)
    for i, line_dict in enumerate(conversation):
        for speaker, line in line_dict.items():
            # Unknown speakers fall back to the female voice.
            voice_type = speaker_voice_map.get(speaker, "female")
            if voice_type == "female":
                voice = generate_female_voice(line)
            else:
                voice = generate_male_voice(line)
            final_audio += voice + AudioSegment.silent(duration=500)

    output_bytes = BytesIO()
    final_audio.export(output_bytes, format="mp3")
    output_bytes.seek(0)
    return output_bytes


@app.post("/convert/")
async def convert_pdf_to_audio(file: UploadFile = File(...)):
    """Accept a PDF upload and stream back the generated MP3.

    Returns 400 when the PDF yields no text, 500 for any other failure.
    """
    try:
        file_bytes = BytesIO(await file.read())
        text = extract_text_from_pdf(file_bytes)
        if not text.strip():
            raise HTTPException(status_code=400, detail="No text extracted from PDF")
        conversation = generate_conversation(text)
        audio_stream = create_audio_stream(conversation)
        # UploadFile.filename can be None if the client omitted it.
        base_name = (file.filename or "output").split('.')[0]
        return StreamingResponse(
            audio_stream,
            media_type="audio/mpeg",
            headers={"Content-Disposition": f"attachment; filename=audio_{base_name}.mp3"}
        )
    except HTTPException:
        # BUGFIX: re-raise as-is so the deliberate 400 above is not
        # swallowed and re-wrapped as a 500 by the handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
def health_check():
    """Liveness probe."""
    return {"status": "healthy"}


if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)