# app.py — Smart Lecture Notes Generator (Gradio app)
# Transcribes lecture audio, OCRs slide frames from video, and summarizes.
import gradio as gr
import os
import tempfile
import speech_recognition as sr
from moviepy.editor import VideoFileClip
import cv2
from PIL import Image
import pytesseract
import nltk
from transformers import pipeline
# Download NLP models
# NOTE(review): "punkt" is fetched here but nltk tokenization is not used in
# the visible code — presumably needed by a downstream dependency; confirm.
nltk.download("punkt")
# Default HuggingFace summarization pipeline (model chosen by transformers);
# loaded once at import time so requests don't pay the model-load cost.
summarizer = pipeline("summarization")
# Audio Transcription
def transcribe_audio(audio_path):
    """Transcribe a speech audio file using Google's free web speech API.

    Args:
        audio_path: Path to an audio file readable by ``sr.AudioFile``
            (WAV/AIFF/FLAC).

    Returns:
        The recognized text as a string.

    Raises:
        sr.UnknownValueError / sr.RequestError on recognition failure
        (callers are expected to handle these).
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        captured = recognizer.record(source)  # read the entire file
    return recognizer.recognize_google(captured)
# Extract audio from video
def extract_audio(video_path):
    """Extract the audio track of a video to ``temp_audio.wav``.

    Args:
        video_path: Path to a video file readable by moviepy.

    Returns:
        Path to the written WAV file (``"temp_audio.wav"`` in the CWD).

    Raises:
        ValueError: If the video has no audio track.
    """
    audio_path = "temp_audio.wav"
    video = VideoFileClip(video_path)
    try:
        # Bug fix: original crashed with an opaque AttributeError on
        # audio-less videos; fail with a clear message instead.
        if video.audio is None:
            raise ValueError(f"Video has no audio track: {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Bug fix: original leaked the ffmpeg reader — always close the clip.
        video.close()
    return audio_path
# Extract key frames from video
def extract_frames(video_path, interval=90, max_frames=3):  # 3 seconds if ~30fps
    """Sample frames from a video and save them as JPEGs in the CWD.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval: Keep every ``interval``-th frame (default 90 ≈ one frame
            per 3 s at ~30 fps).
        max_frames: Stop after this many frames (default 3, matching the
            original behavior of returning the first three).

    Returns:
        List of saved frame filenames (``frame_<n>.jpg``), at most
        ``max_frames`` entries.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    try:
        success, image = vidcap.read()
        # Improvement: original decoded and wrote EVERY sampled frame but
        # returned only the first three, leaving orphan files on disk and
        # wasting decode time — stop as soon as we have enough.
        while success and len(frames) < max_frames:
            if count % interval == 0:
                filename = f"frame_{count}.jpg"
                cv2.imwrite(filename, image)
                frames.append(filename)
            success, image = vidcap.read()
            count += 1
    finally:
        # Bug fix: original never released the capture handle.
        vidcap.release()
    return frames
# OCR on images
def ocr_text_from_frames(frame_paths):
    """Run Tesseract OCR over each frame image.

    Args:
        frame_paths: Iterable of image file paths.

    Returns:
        The OCR text of all frames, joined with newlines.
    """
    return "\n".join(
        pytesseract.image_to_string(Image.open(path)) for path in frame_paths
    )
# Summarize long text
def summarize_text(text):
    """Summarize long text by chunking it into ~1000-character pieces.

    Each chunk is summarized independently by the module-level ``summarizer``
    pipeline; the per-chunk summaries are joined with newlines.

    Args:
        text: The full text to summarize.

    Returns:
        Newline-joined summaries, one per chunk.
    """
    step = 1000
    pieces = []
    for start in range(0, len(text), step):
        chunk = text[start:start + step]
        result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
        pieces.append(result[0]["summary_text"])
    return "\n".join(pieces)
# Core function
def process_lecture(file):
    """Gradio handler: transcribe an uploaded lecture and summarize it.

    For video uploads, also extracts audio and OCRs a few sampled frames;
    for plain audio uploads, the slide-text channel is empty.

    Args:
        file: Gradio file object with ``.name`` and ``.read()``.

    Returns:
        Tuple of (transcript, slide OCR text, summary).
    """
    suffix = os.path.splitext(file.name)[-1]
    # Persist the upload to a real path so moviepy/cv2/speech_recognition
    # can open it by filename.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file.read())
        input_path = tmp.name

    # Bug fix: compare the suffix case-insensitively so ".MP4"/".Mkv"
    # uploads are treated as video rather than falling through to audio.
    if suffix.lower() in (".mp4", ".mkv", ".avi"):
        audio_path = extract_audio(input_path)
        frames = extract_frames(input_path)
        slide_text = ocr_text_from_frames(frames)
    else:
        audio_path = input_path
        slide_text = ""

    try:
        transcript = transcribe_audio(audio_path)
    except Exception as e:
        # Best-effort: surface the failure in the transcript box rather
        # than crashing the whole request.
        transcript = f"[Error during transcription: {e}]"

    full_text = transcript + "\n" + slide_text
    summary = summarize_text(full_text) if full_text.strip() else "No content to summarize."
    return transcript, slide_text, summary
# Launch Gradio Interface
# Launch Gradio Interface
iface = gr.Interface(
    fn=process_lecture,
    inputs=gr.File(label="Upload Lecture Audio or Video"),
    outputs=[
        # Bug fix: the original labels contained mojibake (UTF-8 emoji bytes
        # decoded as cp1252: "🎀", "πŸ–Ό", "πŸ“") — restored to the
        # intended emoji characters.
        gr.Textbox(label="🎀 Transcript"),
        gr.Textbox(label="🖼 Slide OCR Text"),
        gr.Textbox(label="📝 Summary Notes"),
    ],
    title="Smart Lecture Notes Generator",
    description="Upload a lecture recording (audio or video). It will transcribe speech, extract slide text via OCR, and generate summarized notes.",
)
iface.launch()