somebeast committed on
Commit
5aa4075
·
verified ·
1 Parent(s): 2218541

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +95 -53
app.py CHANGED
@@ -188,62 +188,108 @@ def _insight(s):
188
 
189
 
190
  # ---- Handlers ----
191
- def score_text_with_chart(text):
192
- if not text or not text.strip(): return "Enter text.", None, ""
193
- try:
194
- s = _predict(text.strip())
195
- return _fmt(s), _radar(s), _insight(s)
196
- except Exception as e:
197
- import traceback
198
- return f"Error: {e}\n{traceback.format_exc()}", None, ""
199
-
200
-
201
  @spaces.GPU(duration=120)
202
- def _transcribe_video(video_path):
203
- """Extract audio from video and transcribe using Whisper."""
204
- import subprocess, tempfile
205
- # Extract audio with ffmpeg
206
- audio_path = tempfile.mktemp(suffix=".wav")
207
  subprocess.run(["ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
208
  "-ar", "16000", "-ac", "1", audio_path, "-y"],
209
  capture_output=True, timeout=60)
210
 
211
- # Transcribe with Whisper
212
- try:
213
- import whisper
214
- whisper_model = whisper.load_model("base")
215
- result = whisper_model.transcribe(audio_path)
216
- transcript = result["text"]
217
- except ImportError:
218
- # Fallback: use transformers pipeline
219
- from transformers import pipeline
220
- pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base",
221
- device="cuda", torch_dtype=torch.float16)
222
- result = pipe(audio_path)
223
- transcript = result["text"]
224
- finally:
225
- if os.path.exists(audio_path):
226
- os.unlink(audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  torch.cuda.empty_cache()
229
- return transcript
 
 
 
 
 
 
 
 
 
230
 
231
 
232
- def score_video(video):
233
- if video is None: return "Upload a video.", None, ""
234
  try:
235
- # Transcribe video audio
236
- transcript = _transcribe_video(video)
237
- if not transcript or not transcript.strip():
238
- return "Could not extract speech from video.", None, ""
239
-
240
- # Score the transcript
241
- s = _predict(transcript.strip())
242
- scores_text = f"Transcript: {transcript[:200]}{'...' if len(transcript) > 200 else ''}\n\n{_fmt(s)}"
243
- return scores_text, _radar(s, title="Video Brain Engagement"), _insight(s)
244
  except Exception as e:
245
  import traceback
246
- return f"Error: {e}\n{traceback.format_exc()}", None, ""
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  def ab_test_safe(a, b):
@@ -277,21 +323,17 @@ with gr.Blocks(title="TRIBE V2 Brain Prediction", theme=gr.themes.Base(
277
  with gr.Tab("📝 Text"):
278
  t_in = gr.Textbox(label="Content", lines=5, placeholder="Paste script or hook...")
279
  t_btn = gr.Button("🧠 Analyze", variant="primary")
280
- with gr.Row():
281
- t_out = gr.Textbox(label="Scores", lines=10)
282
- t_img = gr.Image(label="Brain Radar", type="filepath")
283
  t_ins = gr.Textbox(label="💡 Insight")
284
- t_btn.click(score_text_with_chart, [t_in], [t_out, t_img, t_ins], api_name="predict")
285
 
286
  with gr.Tab("🎬 Video"):
287
- gr.Markdown("Upload a video — audio is transcribed and scored. ~30-60s.")
288
  v_in = gr.Video(label="Upload Video")
289
  v_btn = gr.Button("🧠 Analyze Video", variant="primary")
290
- with gr.Row():
291
- v_out = gr.Textbox(label="Scores", lines=10)
292
- v_img = gr.Image(label="Brain Radar", type="filepath")
293
  v_ins = gr.Textbox(label="💡 Insight")
294
- v_btn.click(score_video, [v_in], [v_out, v_img, v_ins], api_name="predict_video")
295
 
296
  with gr.Tab("⚔️ A/B Test"):
297
  with gr.Row():
 
188
 
189
 
190
  # ---- Handlers ----
 
 
 
 
 
 
 
 
 
 
191
  @spaces.GPU(duration=120)
192
+ def _transcribe_and_score(video_path):
193
+ """Extract audio, transcribe with Whisper, then score with Phi-2."""
194
+ import subprocess
195
+ # Extract audio
196
+ audio_path = os.path.join(os.path.dirname(video_path), "audio_extract.wav")
197
  subprocess.run(["ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
198
  "-ar", "16000", "-ac", "1", audio_path, "-y"],
199
  capture_output=True, timeout=60)
200
 
201
+ # Transcribe
202
+ import whisper
203
+ whisper_model = whisper.load_model("base", device="cuda")
204
+ result = whisper_model.transcribe(audio_path)
205
+ transcript = result["text"]
206
+
207
+ if os.path.exists(audio_path):
208
+ os.unlink(audio_path)
209
+
210
+ if not transcript or not transcript.strip():
211
+ raise ValueError("No speech detected in video")
212
+
213
+ # Score transcript using Phi-2
214
+ m = ensure_model()
215
+ tok = m["tokenizer"]
216
+ llm = m["model"].cuda().half()
217
+ inputs = tok(transcript, return_tensors="pt", truncation=True, max_length=512).to("cuda")
218
+ with torch.inference_mode():
219
+ outputs = llm(**inputs)
220
+
221
+ logits = outputs.logits
222
+ hidden = outputs.hidden_states[-1]
223
+
224
+ shift_logits = logits[:, :-1, :].contiguous()
225
+ shift_labels = inputs["input_ids"][:, 1:].contiguous()
226
+ losses = torch.nn.CrossEntropyLoss(reduction="none")(
227
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
228
+ perplexity = float(torch.exp(losses.mean()).cpu())
229
+ attention_raw = min(perplexity / 30.0, 1.0)
230
+
231
+ ids = inputs["input_ids"][0].cpu().tolist()
232
+ language_raw = len(set(ids)) / max(len(ids), 1)
233
+
234
+ hn = hidden.squeeze().cpu().float().numpy()
235
+ norms = np.linalg.norm(hn, axis=1)
236
+ emotion_raw = float(np.std(norms) / (np.mean(norms) + 1e-8))
237
+
238
+ tl = transcript.lower()
239
+ nums = sum(c.isdigit() for c in transcript) / max(len(transcript), 1)
240
+ caps = sum(c.isupper() for c in transcript) / max(len(transcript), 1)
241
+ urgency = sum(1 for w in ["now", "shock", "destroy", "change", "secret",
242
+ "never", "always", "must", "urgent", "breaking", "exclusive", "free",
243
+ "fastest", "cheapest", "worst", "best", "insane", "crazy"] if w in tl)
244
+ visual_raw = min(nums * 10 + caps * 5 + urgency * 0.15, 1.0)
245
+
246
+ words = tl.split()
247
+ personal = sum(1 for w in words if w in ["i", "me", "my", "you", "your", "we", "our"])
248
+ dm_raw = min(personal / max(len(words), 1) * 5, 1.0)
249
+
250
+ def sig(v, c=0.3, s=8.0):
251
+ return float(100.0 / (1.0 + np.exp(-s * (max(0, min(1, v)) - c))))
252
+
253
+ att = sig(attention_raw, 0.25, 6.0)
254
+ emo = sig(emotion_raw, 0.15, 10.0)
255
+ lang = sig(language_raw, 0.5, 8.0)
256
+ vis = sig(visual_raw, 0.2, 8.0)
257
+ dm = sig(dm_raw, 0.2, 6.0)
258
+ overall = (att + emo + lang + vis + dm) / 5.0
259
+ viral = att * 0.4 + emo * 0.4 + vis * 0.2
260
 
261
  torch.cuda.empty_cache()
262
+ return transcript, {
263
+ "overall_brain_engagement": round(overall, 1),
264
+ "viral_potential": round(viral, 1),
265
+ "attention_capture": round(att, 1),
266
+ "emotional_valence": round(emo, 1),
267
+ "language_processing": round(lang, 1),
268
+ "visual_imagery": round(vis, 1),
269
+ "hook_effectiveness": round(att, 1),
270
+ "retention_prediction": round(min(lang / max(att, 1) * 100, 100), 1),
271
+ }
272
 
273
 
274
+ def score_video_safe(video):
275
+ if video is None: return "Upload a video.", ""
276
  try:
277
+ transcript, s = _transcribe_and_score(video)
278
+ preview = transcript[:300] + ("..." if len(transcript) > 300 else "")
279
+ return f"Transcript:\n{preview}\n\n{_fmt(s)}", _insight(s)
 
 
 
 
 
 
280
  except Exception as e:
281
  import traceback
282
+ return f"Error: {e}\n{traceback.format_exc()}", ""
283
+
284
+
285
+ def score_text_safe(text):
286
+ if not text or not text.strip(): return "Enter text.", ""
287
+ try:
288
+ s = _predict(text.strip())
289
+ return _fmt(s), _insight(s)
290
+ except Exception as e:
291
+ import traceback
292
+ return f"Error: {e}\n{traceback.format_exc()}", ""
293
 
294
 
295
  def ab_test_safe(a, b):
 
323
  with gr.Tab("📝 Text"):
324
  t_in = gr.Textbox(label="Content", lines=5, placeholder="Paste script or hook...")
325
  t_btn = gr.Button("🧠 Analyze", variant="primary")
326
+ t_out = gr.Textbox(label="Scores", lines=10)
 
 
327
  t_ins = gr.Textbox(label="💡 Insight")
328
+ t_btn.click(score_text_safe, [t_in], [t_out, t_ins], api_name="predict")
329
 
330
  with gr.Tab("🎬 Video"):
331
+ gr.Markdown("Upload a video — audio is transcribed and scored. ~30-60s on GPU.")
332
  v_in = gr.Video(label="Upload Video")
333
  v_btn = gr.Button("🧠 Analyze Video", variant="primary")
334
+ v_out = gr.Textbox(label="Scores", lines=12)
 
 
335
  v_ins = gr.Textbox(label="💡 Insight")
336
+ v_btn.click(score_video_safe, [v_in], [v_out, v_ins], api_name="predict_video")
337
 
338
  with gr.Tab("⚔️ A/B Test"):
339
  with gr.Row():