import html
import io

import torchaudio
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import HTMLResponse
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForCausalLM


app = FastAPI()
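
# Hugging Face model IDs: Granite Speech for transcription, Granite Instruct for summarization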
SPEECH_MODEL = "ibm-granite/granite-speech-3.3-8b"
LLM_MODEL = "ibm-granite/granite-3.3-8b-instruct"
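
# Load both models once at startup; "cpu" keeps the example portable, but an 8B model is far
# faster on a GPU (swap in "cuda" if one is available)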
speech_processor = AutoProcessor.from_pretrained(SPEECH_MODEL)
speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(SPEECH_MODEL).to("cpu")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL).to("cpu")
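
# Landing page: a minimal HTML form for uploading an audio recording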
@app.get("/", response_class=HTMLResponse)
def home():
    return """
    <h1>Meeting Memory Workflow Automation</h1>
    <form action="/transcribe" enctype="multipart/form-data" method="post">
        <input type="file" name="audiofile" accept="audio/*" required/>
        <button type="submit">Upload & Transcribe</button>
    </form>
    """
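
# /transcribe: decode the uploaded audio, transcribe it with the speech model,
# then summarize the transcript with the LLM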
@app.post("/transcribe")
async def transcribe(audiofile: UploadFile = File(...)):
    # Read the upload into memory and decode it with torchaudio
    audio_bytes = await audiofile.read()
    audio, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))

    # The speech model expects 16 kHz mono audio (per the Granite Speech model card),
    # so downmix and resample if needed
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        audio = torchaudio.functional.resample(audio, orig_freq=sample_rate, new_freq=16000)
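
    # Speech-to-text with Granite Speech. The model card prompts it like a chat model,
    # pairing a text prompt that contains an <|audio|> placeholder with the raw waveform.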
    chat = [{"role": "user", "content": "<|audio|>can you transcribe the speech into a written format?"}]
    prompt = speech_processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = speech_processor(prompt, audio, return_tensors="pt").to(speech_model.device)
    generated_ids = speech_model.generate(**inputs, max_new_tokens=200)
    # The generated sequence includes the prompt tokens, so decode only the new ones
    new_tokens = generated_ids[:, inputs["input_ids"].shape[-1]:]
    transcript = speech_processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
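
    # Summarization: Granite Instruct is a chat model, so wrap the request in its chat template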
    chat = [{"role": "user", "content": f"Summarize the following text: {transcript}"}]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    summary_ids = llm_model.generate(**inputs, max_new_tokens=200)
    # Decode only the tokens generated after the prompt rather than string-replacing the prompt text
    summary = tokenizer.decode(summary_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
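
    # Render both results as a simple HTML page; escape the text so model output is not interpreted as markup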
    page = f"""
    <h2>Transcript</h2>
    <pre>{html.escape(transcript)}</pre>
    <h2>Summary</h2>
    <pre>{html.escape(summary)}</pre>
    <a href="/">Back</a>
    """
    return HTMLResponse(content=page)
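
# Run locally with, e.g.: uvicorn main:app --reload  (assumes this file is saved as main.py)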