Mpavan45 committed on
Commit
5c217b4
·
verified ·
1 Parent(s): 89df600

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -59
app.py CHANGED
@@ -7,9 +7,61 @@ from sklearn.metrics.pairwise import cosine_similarity
7
  from transformers import pipeline
8
  import os
9
  import tempfile
10
- import math
11
-
12
- # Hugging Face Whisper Model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-base")
14
 
15
  # Function to extract audio from video
@@ -18,92 +70,81 @@ def extract_audio(video_file, audio_file):
18
  output_file = ffmpeg.output(input_file, audio_file, **{'vn': None, 'ar': 16000, 'ac': 1, 'f': 'wav'})
19
  ffmpeg.run(output_file)
20
 
21
- # Function to transcribe audio using Hugging Face Whisper
22
  def transcribe_audio(audio_file):
23
  result = whisper_model(audio_file)
24
  return result['text']
25
 
26
- # Function to generate embeddings
27
  def generate_embedding(text):
28
  model = SentenceTransformer('all-MiniLM-L6-v2')
29
  return model.encode(text).tolist()
30
 
31
- # Function to split large CSV into smaller chunks
32
- def split_csv(csv_path, chunk_size=50000):
33
- chunks = []
34
- for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size)):
35
- chunk_file = f"chunk_{i}.csv"
36
- chunk.to_csv(chunk_file, index=False)
37
- chunks.append(chunk_file)
38
- return chunks
39
-
40
- # Function to search subtitles in chunks
41
- def search_in_chunks(transcribed_text, chunk_files, top_k=10):
42
- query_embedding = np.array(generate_embedding(transcribed_text)).reshape(1, -1)
43
- results = []
44
-
45
- for chunk_file in chunk_files:
46
- chunk = pd.read_csv(chunk_file)
47
- subtitle_embeddings = np.array([np.array(eval(e)) for e in chunk['embedding'].tolist()])
48
-
49
- # Calculate cosine similarity
50
- similarities = cosine_similarity(query_embedding, subtitle_embeddings).flatten()
51
-
52
- for idx, similarity in enumerate(similarities):
53
- results.append({
54
- "text": chunk.iloc[idx]['text'],
55
- "cosine_similarity": similarity
56
- })
57
-
58
- # Sort results by similarity score
59
- results = sorted(results, key=lambda x: x['cosine_similarity'], reverse=True)[:top_k]
60
- return results
61
 
62
  # Streamlit UI
63
- st.title("πŸŽ₯ Video Subtitle Search with Hugging Face Whisper and Chunking")
64
 
65
  # Upload video
66
  uploaded_file = st.file_uploader("Upload a video", type=["mp4", "avi", "mov", "mkv"])
67
 
68
- if uploaded_file:
69
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
70
- temp_video.write(uploaded_file.getbuffer())
71
- video_path = temp_video.name
 
 
 
 
72
 
73
- audio_path = "temp_audio.wav"
74
 
75
  # Extract audio
76
  st.info("Extracting audio...")
77
  extract_audio(video_path, audio_path)
78
 
79
- # Transcribe audio using Hugging Face Whisper
80
  st.info("Transcribing audio...")
81
  transcribed_text = transcribe_audio(audio_path)
82
  st.text_area("Transcribed Text", transcribed_text, height=150)
83
 
84
- # Split the large CSV database into smaller chunks
85
- st.info("Splitting large database into smaller chunks...")
86
- subtitle_db_path = "database.csv"
87
- chunk_files = split_csv(subtitle_db_path, chunk_size=50000)
88
-
89
- # Search subtitles in chunks
90
- st.info("Searching subtitles in chunks...")
91
- matching_subtitles = search_in_chunks(transcribed_text, chunk_files)
92
 
93
  # Display video
94
  st.video(video_path)
95
 
96
- # Display matching subtitles with similarity scores
97
- st.subheader("πŸ“œ Matching Subtitles (Chunking + Cosine Similarity)")
98
  for sub in matching_subtitles:
99
  st.write(f"**Subtitle:** {sub['text']}")
100
  st.write(f"**Cosine Similarity:** {sub['cosine_similarity']:.4f}")
101
  st.write("---")
102
 
103
- # Cleanup
104
- os.remove(video_path)
105
- os.remove(audio_path)
106
-
107
- # Remove chunk files
108
- for chunk_file in chunk_files:
109
- os.remove(chunk_file)
 
7
  from transformers import pipeline
8
  import os
9
  import tempfile
10
+ import shutil
11
+ import chromadb
12
+
13
# Chroma client used by load_csv_to_chroma() to create the subtitle collection.
client = chromadb.Client()

# Sidebar: title plus the uploader for the subtitle CSV.
with st.sidebar:
    st.title("πŸ“‚ Upload CSV File")
    csv_file = st.file_uploader("Choose a CSV file", type=["csv"])
19
+
20
# Persist the uploaded CSV next to the app.
def save_csv_permanently(uploaded_file):
    """Write the uploaded file to ``permanent_subtitle_data.csv`` in the cwd.

    Args:
        uploaded_file: Object exposing ``getbuffer()`` whose result can be
            written to a binary file.

    Returns:
        The path of the file that was written.
    """
    destination = os.path.join(os.getcwd(), "permanent_subtitle_data.csv")
    with open(destination, "wb") as sink:
        sink.write(uploaded_file.getbuffer())
    return destination
26
+
27
# Load the CSV into Chroma DB
def load_csv_to_chroma(csv_path, batch_size=1000):
    """Rebuild the ``video_subtitles`` Chroma collection from a CSV file.

    The CSV must have a ``text`` column and an ``embedding`` column whose
    cells are Python list literals (e.g. ``"[0.1, 0.2]"``).

    Args:
        csv_path: Path of the CSV file to read.
        batch_size: Number of rows sent to Chroma per ``add`` call.

    Returns:
        The freshly created collection, populated with one entry per CSV row
        (ids are the stringified DataFrame index values).

    Raises:
        ValueError / SyntaxError: If an ``embedding`` cell is not a valid
            Python literal.
    """
    import ast  # local import so this block does not depend on file-level imports

    df = pd.read_csv(csv_path)

    # SECURITY: the CSV is user-uploaded. The previous implementation ran
    # eval() on every cell, which executes arbitrary code. literal_eval only
    # accepts plain literals (lists, numbers, ...), which is all that is needed.
    embeddings = [
        np.array(ast.literal_eval(raw)).tolist() for raw in df['embedding']
    ]
    documents = df['text'].tolist()
    ids = [str(i) for i in df.index]

    # Start from an empty collection on every load.
    collection_name = "video_subtitles"
    # NOTE(review): list_collections() presumably yields Collection objects in
    # some chromadb releases and plain names in others - accept both; verify
    # against the installed version.
    existing = {getattr(col, "name", col) for col in client.list_collections()}
    if collection_name in existing:
        client.delete_collection(name=collection_name)

    collection = client.create_collection(name=collection_name)

    # Add rows in batches rather than one call per row.
    # NOTE(review): Chroma is assumed to cap the size of a single add();
    # confirm 1000 is within the limit for the installed version.
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            embeddings=embeddings[start:stop],
        )

    return collection
50
+
51
# Once a CSV has been uploaded: save it to disk, then index it in Chroma.
if csv_file:
    _sidebar = st.sidebar
    _sidebar.success("CSV uploaded successfully!")

    # Write the upload to disk and tell the user where it went.
    csv_path = save_csv_permanently(csv_file)
    _sidebar.success(f"CSV saved permanently at: {csv_path}")

    # Build the collection while a spinner is shown.
    with st.spinner("Loading CSV into Chroma DB..."):
        collection = load_csv_to_chroma(csv_path)
    _sidebar.success("CSV loaded into Chroma DB βœ…")
63
+
64
# Whisper model for transcription.
# Streamlit re-executes the script on every interaction; st.cache_resource
# keeps one pipeline instance instead of rebuilding it on each rerun.
@st.cache_resource
def _load_whisper_model():
    """Build the Hugging Face Whisper speech-recognition pipeline."""
    return pipeline("automatic-speech-recognition", model="openai/whisper-base")


whisper_model = _load_whisper_model()
66
 
67
  # Function to extract audio from video
 
70
  output_file = ffmpeg.output(input_file, audio_file, **{'vn': None, 'ar': 16000, 'ac': 1, 'f': 'wav'})
71
  ffmpeg.run(output_file)
72
 
73
# Function to transcribe audio
def transcribe_audio(audio_file, chunk_length_s=30):
    """Transcribe an audio file with the module-level Whisper pipeline.

    Args:
        audio_file: Audio input passed straight to ``whisper_model``.
        chunk_length_s: Length in seconds of the windows the pipeline splits
            the audio into. Whisper works on 30-second windows, so without
            chunking longer recordings are not transcribed in full.

    Returns:
        The ``'text'`` field of the pipeline result.
    """
    result = whisper_model(audio_file, chunk_length_s=chunk_length_s)
    return result['text']
77
 
78
# Generate embeddings for the transcription
@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def generate_embedding(text):
    """Encode ``text`` with ``all-MiniLM-L6-v2`` and return it as a list.

    The model is obtained from a cached loader, so it is not re-instantiated
    on every call as it was previously.
    """
    return _load_embedding_model().encode(text).tolist()
82
 
83
# Search subtitles in Chroma DB
def search_in_chroma(transcribed_text, collection, top_k=10):
    """Return the stored subtitles most similar to ``transcribed_text``.

    Args:
        transcribed_text: Query text; embedded with ``generate_embedding``.
        collection: Chroma collection to query.
        top_k: Maximum number of matches to return.

    Returns:
        A list of ``{"text": ..., "cosine_similarity": ...}`` dicts sorted by
        descending cosine similarity. Empty if the collection has no entries.
    """
    # Nothing to search, and never ask for more neighbours than exist.
    available = collection.count()
    if available == 0:
        return []

    query_embedding = np.array(generate_embedding(transcribed_text))

    # Embeddings are only returned when explicitly requested via `include`;
    # without it results['embeddings'] is not populated and indexing it fails.
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=min(top_k, available),
        include=["documents", "embeddings"],
    )

    documents = results['documents'][0]
    if not documents:
        return []

    # One vectorised call: (1, dim) against (k, dim) -> (1, k) similarities.
    stored_embeddings = np.asarray(results['embeddings'][0])
    similarities = cosine_similarity([query_embedding], stored_embeddings)[0]

    subtitles = [
        {"text": doc, "cosine_similarity": float(score)}
        for doc, score in zip(documents, similarities)
    ]

    # Sort results by similarity, best match first.
    subtitles.sort(key=lambda item: item['cosine_similarity'], reverse=True)
    return subtitles
 
 
 
 
 
 
107
 
108
  # Streamlit UI
109
st.title("πŸŽ₯ Video Subtitle Search with Chroma DB")

# Upload video
uploaded_file = st.file_uploader("Upload a video", type=["mp4", "avi", "mov", "mkv"])

# The search needs both the video and the subtitle CSV (which defines `collection`).
if uploaded_file and csv_file:
    # TemporaryDirectory removes the working files even if a step below
    # raises; the previous mkdtemp()/rmtree() pair leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save video temporarily
        video_path = os.path.join(temp_dir, "temp_video.mp4")
        with open(video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        audio_path = os.path.join(temp_dir, "temp_audio.wav")

        # Extract audio
        st.info("Extracting audio...")
        extract_audio(video_path, audio_path)

        # Transcribe audio
        st.info("Transcribing audio...")
        transcribed_text = transcribe_audio(audio_path)
        st.text_area("Transcribed Text", transcribed_text, height=150)

        # Search in Chroma DB
        st.info("Searching subtitles in Chroma DB...")
        matching_subtitles = search_in_chroma(transcribed_text, collection)

        # Display video
        st.video(video_path)

        # Display matching subtitles
        st.subheader("πŸ“œ Matching Subtitles with Cosine Similarity")
        for sub in matching_subtitles:
            st.write(f"**Subtitle:** {sub['text']}")
            st.write(f"**Cosine Similarity:** {sub['cosine_similarity']:.4f}")
            st.write("---")

    # Reached only after the temporary directory has been removed.
    st.success("Temporary files cleaned up successfully βœ…")