karthikmn committed
Commit 8a0fbba · verified · 1 Parent(s): 09d9f61

Update app.py

Files changed (1)
  1. app.py +32 -93
app.py CHANGED
@@ -1,98 +1,37 @@
  import gradio as gr
- import os
- import tempfile
- import speech_recognition as sr
- from moviepy.editor import VideoFileClip
- import cv2
- from PIL import Image
- import pytesseract
- import nltk
- from transformers import pipeline
-
- # Download NLP models
- nltk.download("punkt")
- summarizer = pipeline("summarization")
-
- # Audio Transcription
- def transcribe_audio(audio_path):
-     recognizer = sr.Recognizer()
-     with sr.AudioFile(audio_path) as source:
-         audio = recognizer.record(source)
-     return recognizer.recognize_google(audio)
-
- # Extract audio from video
- def extract_audio(video_path):
-     video = VideoFileClip(video_path)
-     audio_path = "temp_audio.wav"
-     video.audio.write_audiofile(audio_path)
-     return audio_path
-
- # Extract key frames from video
- def extract_frames(video_path, interval=90):  # 3 seconds if ~30fps
-     vidcap = cv2.VideoCapture(video_path)
-     success, image = vidcap.read()
-     count = 0
-     frames = []
-     while success:
-         if count % interval == 0:
-             filename = f"frame_{count}.jpg"
-             cv2.imwrite(filename, image)
-             frames.append(filename)
-         success, image = vidcap.read()
-         count += 1
-     return frames[:3]  # return top 3
-
- # OCR on images
- def ocr_text_from_frames(frame_paths):
-     texts = []
-     for frame in frame_paths:
-         img = Image.open(frame)
-         text = pytesseract.image_to_string(img)
-         texts.append(text)
-     return "\n".join(texts)
-
- # Summarize long text
- def summarize_text(text):
-     chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
-     summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text'] for chunk in chunks]
-     return "\n".join(summaries)
-
- # Core function
- def process_lecture(file):
-     suffix = os.path.splitext(file.name)[-1]
-     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-         tmp.write(file.read())
-         input_path = tmp.name
-
-     if suffix in [".mp4", ".mkv", ".avi"]:
-         audio_path = extract_audio(input_path)
-         frames = extract_frames(input_path)
-         slide_text = ocr_text_from_frames(frames)
-     else:
-         audio_path = input_path
-         slide_text = ""
-
-     try:
-         transcript = transcribe_audio(audio_path)
-     except Exception as e:
-         transcript = f"[Error during transcription: {e}]"
-
-     full_text = transcript + "\n" + slide_text
-     summary = summarize_text(full_text) if full_text.strip() else "No content to summarize."
-
-     return transcript, slide_text, summary
-
- # Launch Gradio Interface
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import torch
+
+ # Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ # Function to convert speech to text
+ def speech_to_text(audio_file):
+     # Load the audio file
+     audio_input, _ = torchaudio.load(audio_file.name)
+
+     # Preprocess the audio input (e.g., resample, normalize, etc.)
+     input_values = processor(audio_input, return_tensors="pt").input_values
+
+     # Perform speech-to-text (CTC Decoding)
+     with torch.no_grad():
+         logits = model(input_values).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+
+     # Decode the predicted ids to text
+     transcription = processor.decode(predicted_ids[0])
+
+     return transcription
+
+ # Set up the Gradio interface
  iface = gr.Interface(
-     fn=process_lecture,
-     inputs=gr.File(label="Upload Lecture Audio or Video"),
-     outputs=[
-         gr.Textbox(label="🎤 Transcript"),
-         gr.Textbox(label="🖼 Slide OCR Text"),
-         gr.Textbox(label="📝 Summary Notes")
-     ],
-     title="Smart Lecture Notes Generator",
-     description="Upload a lecture recording (audio or video). It will transcribe speech, extract slide text via OCR, and generate summarized notes."
+     fn=speech_to_text,  # Function to be executed
+     inputs=gr.Audio(source="upload", type="file"),  # Allow audio file upload
+     outputs=gr.Textbox(),  # Display transcription in a text box
+     title="Speech-to-Text Analyzer for Lecture Notes",
+     description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech."
  )

+ # Launch the interface
  iface.launch()
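
As committed, the new app.py launches but fails on the first transcription: speech_to_text calls torchaudio.load without importing torchaudio, the waveform is fed to the processor at its native sample rate and (channels, samples) shape even though facebook/wav2vec2-large-960h expects 16 kHz mono input, and gr.Audio(source="upload", type="file") uses arguments that recent Gradio releases deprecate or reject. A minimal corrected sketch of the same greedy-CTC pipeline, assuming a Gradio version that accepts type="filepath" (the labels below are illustrative, not part of the commit):

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

def speech_to_text(audio_path):
    # torchaudio returns a (channels, samples) tensor plus the file's native rate
    waveform, sample_rate = torchaudio.load(audio_path)

    # Downmix to mono and resample to the 16 kHz rate the model was trained on
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # The processor expects a 1-D waveform and an explicit sampling rate
    input_values = processor(waveform, sampling_rate=16000, return_tensors="pt").input_values

    # Greedy CTC decoding: argmax per frame, then collapse repeats and blanks
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])

iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath", label="Upload Lecture Audio"),
    outputs=gr.Textbox(label="Transcript"),
    title="Speech-to-Text Analyzer for Lecture Notes",
    description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech."
)

iface.launch()

With type="filepath" the handler receives a plain path string, so the audio_file.name indirection from the commit is no longer needed; recent gr.Audio accepts uploads by default, which is why source="upload" is dropped.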
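
A further caveat for lecture-length recordings: wav2vec2 is a transformer, so compute and memory grow steeply with input length, and a one-hour file in a single forward pass will likely exhaust a CPU Space. A common workaround is windowed decoding. The helper below is hypothetical (not in the commit) and reuses the processor and model from the sketch above; naive splitting can cut a word at each window boundary, so overlapping windows would be the next refinement.

def transcribe_long(waveform, sample_rate=16000, chunk_seconds=30):
    # Hypothetical helper: greedy-decode a long 1-D waveform in fixed windows
    step = chunk_seconds * sample_rate
    pieces = []
    for start in range(0, waveform.shape[0], step):
        chunk = waveform[start:start + step]
        input_values = processor(chunk, sampling_rate=sample_rate, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        pieces.append(processor.decode(predicted_ids[0]))
    return " ".join(pieces)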
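
Finally, a Space installs Python dependencies from requirements.txt, which this commit does not touch. The rewrite drops speech_recognition, moviepy, opencv, pytesseract, and nltk but now needs torch and torchaudio, so that file presumably has to change as well. A plausible minimal listing (an assumption; the actual file is not shown in this commit):

gradio
torch
torchaudio
transformers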