import os
import tempfile

import cv2
import gradio as gr
import nltk
import pytesseract
import speech_recognition as sr
from moviepy.editor import VideoFileClip
from PIL import Image
from transformers import pipeline

# Download the sentence-tokenizer data and load the default summarization
# model once at import time so per-request latency stays low.
nltk.download("punkt")
summarizer = pipeline("summarization")

# Container formats treated as video (audio + frames); anything else is
# assumed to be a plain audio file.
VIDEO_EXTENSIONS = {".mp4", ".mkv", ".avi"}


def transcribe_audio(audio_path):
    """Transcribe a WAV/AIFF/FLAC file via Google's free speech API.

    Args:
        audio_path: Path to an audio file readable by ``sr.AudioFile``.

    Returns:
        The recognized transcript as a string.

    Raises:
        sr.UnknownValueError / sr.RequestError: on recognition failure
        (handled by the caller's best-effort wrapper).
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)
    return recognizer.recognize_google(audio)


def extract_audio(video_path):
    """Extract a video's audio track to a temporary WAV file.

    Returns the path of the written WAV file. A unique temp path is used
    instead of a fixed name so concurrent requests cannot clobber each
    other's output.
    """
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # moviepy opens the path itself; we only need the name
    video = VideoFileClip(video_path)
    try:
        video.audio.write_audiofile(audio_path)
    finally:
        video.close()  # release the ffmpeg reader / file handle
    return audio_path


def extract_frames(video_path, interval=90, max_frames=3):
    """Sample every ``interval``-th frame of a video as a JPEG.

    Args:
        video_path: Path to the video file.
        interval: Frame stride between samples (90 ≈ every 3 s at ~30 fps).
        max_frames: Maximum number of frames to save and return.

    Returns:
        A list of at most ``max_frames`` saved image paths.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    success, image = vidcap.read()
    # Stop as soon as enough frames are collected instead of decoding the
    # whole video and throwing the excess away.
    while success and len(frames) < max_frames:
        if count % interval == 0:
            # Write into the system temp dir rather than the CWD.
            filename = os.path.join(tempfile.gettempdir(), f"frame_{count}.jpg")
            cv2.imwrite(filename, image)
            frames.append(filename)
        success, image = vidcap.read()
        count += 1
    vidcap.release()
    return frames


def ocr_text_from_frames(frame_paths):
    """Run Tesseract OCR over each frame image; join results with newlines."""
    texts = []
    for frame in frame_paths:
        img = Image.open(frame)
        texts.append(pytesseract.image_to_string(img))
    return "\n".join(texts)


def summarize_text(text):
    """Summarize arbitrarily long text.

    The input is split into 1000-character chunks so each piece fits the
    summarization model's input limit; per-chunk summaries are joined.
    """
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
    summaries = [
        summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
        for chunk in chunks
    ]
    return "\n".join(summaries)


def process_lecture(file):
    """Core pipeline: uploaded file -> (transcript, slide OCR text, summary).

    Args:
        file: The uploaded file object from ``gr.File`` (has ``.name`` and
            ``.read()``).

    Returns:
        Tuple of (transcript, slide_text, summary) strings for the three
        output textboxes.
    """
    # Lower-case the extension so ".MP4" etc. is still recognized as video.
    suffix = os.path.splitext(file.name)[-1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file.read())
        input_path = tmp.name

    if suffix in VIDEO_EXTENSIONS:
        audio_path = extract_audio(input_path)
        frames = extract_frames(input_path)
        slide_text = ocr_text_from_frames(frames)
    else:
        # Plain audio upload: no frames to OCR.
        audio_path = input_path
        slide_text = ""

    try:
        transcript = transcribe_audio(audio_path)
    except Exception as e:
        # Best-effort: surface the failure in the transcript box instead of
        # crashing the UI (network/API errors are expected occasionally).
        transcript = f"[Error during transcription: {e}]"

    full_text = transcript + "\n" + slide_text
    summary = summarize_text(full_text) if full_text.strip() else "No content to summarize."
    return transcript, slide_text, summary


# Gradio UI wiring: one file input, three text outputs.
iface = gr.Interface(
    fn=process_lecture,
    inputs=gr.File(label="Upload Lecture Audio or Video"),
    outputs=[
        gr.Textbox(label="🎤 Transcript"),
        gr.Textbox(label="🖼 Slide OCR Text"),
        gr.Textbox(label="📝 Summary Notes"),
    ],
    title="Smart Lecture Notes Generator",
    description="Upload a lecture recording (audio or video). It will transcribe speech, extract slide text via OCR, and generate summarized notes.",
)

if __name__ == "__main__":
    iface.launch()