hamza2923 committed on
Commit
cb031f8
·
verified ·
1 Parent(s): 37177e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -13
app.py CHANGED
@@ -2,11 +2,12 @@ from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from selenium import webdriver
5
- from selenium.webdriver.common.by import By
6
  from selenium.webdriver.chrome.options import Options
 
7
  from selenium.webdriver.support.ui import WebDriverWait
8
  from selenium.webdriver.support import expected_conditions as EC
9
- from selenium.common.exceptions import TimeoutException
10
  import time
11
  import logging
12
  import os
@@ -37,17 +38,24 @@ class TranscriptResponse(BaseModel):
37
 
38
  def init_driver():
39
  options = Options()
40
- options.add_argument("--headless")
41
- options.add_argument("--disable-gpu")
42
  options.add_argument("--no-sandbox")
43
  options.add_argument("--disable-dev-shm-usage")
44
- options.add_argument("--log-level=3")
45
- options.add_experimental_option("excludeSwitches", ["enable-automation"])
46
- options.add_experimental_option('useAutomationExtension', False)
47
 
48
- # For Hugging Face Spaces
49
  options.binary_location = "/usr/bin/google-chrome"
50
- return webdriver.Chrome(options=options)
 
 
 
 
 
 
 
 
 
51
 
52
  @app.post("/transcript", response_model=TranscriptResponse)
53
  async def get_transcript(request: VideoRequest):
@@ -71,6 +79,7 @@ async def get_transcript(request: VideoRequest):
71
  cookie_button.click()
72
  logger.info("Accepted cookies")
73
  except TimeoutException:
 
74
  pass
75
 
76
  # Click more button
@@ -92,8 +101,14 @@ async def get_transcript(request: VideoRequest):
92
 
93
  # Extract transcript
94
  segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
95
- transcript = [segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
96
- for segment in segments if segment.find_element(By.CLASS_NAME, "segment-text").text.strip()]
 
 
 
 
 
 
97
 
98
  if not transcript:
99
  raise HTTPException(status_code=404, detail="No transcript available")
@@ -106,11 +121,12 @@ async def get_transcript(request: VideoRequest):
106
  )
107
 
108
  except TimeoutException as e:
109
- logger.error(f"Timeout: {str(e)}")
 
110
  return TranscriptResponse(
111
  success=False,
112
  transcript=None,
113
- error="Timed out waiting for page elements",
114
  processing_time=time.time() - start_time
115
  )
116
  except Exception as e:
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from selenium import webdriver
5
+ from selenium.webdriver.chrome.service import Service
6
  from selenium.webdriver.chrome.options import Options
7
+ from selenium.webdriver.common.by import By
8
  from selenium.webdriver.support.ui import WebDriverWait
9
  from selenium.webdriver.support import expected_conditions as EC
10
+ from selenium.common.exceptions import TimeoutException, WebDriverException
11
  import time
12
  import logging
13
  import os
 
38
 
39
  def init_driver():
40
  options = Options()
41
+ options.add_argument("--headless=new")
 
42
  options.add_argument("--no-sandbox")
43
  options.add_argument("--disable-dev-shm-usage")
44
+ options.add_argument("--disable-gpu")
45
+ options.add_argument("--window-size=1280,1696")
 
46
 
47
+ # Chrome binary location in Hugging Face Spaces
48
  options.binary_location = "/usr/bin/google-chrome"
49
+
50
+ # ChromeDriver path
51
+ service = Service(executable_path="/usr/bin/chromedriver")
52
+
53
+ try:
54
+ driver = webdriver.Chrome(service=service, options=options)
55
+ return driver
56
+ except WebDriverException as e:
57
+ logger.error(f"Driver initialization failed: {str(e)}")
58
+ raise
59
 
60
  @app.post("/transcript", response_model=TranscriptResponse)
61
  async def get_transcript(request: VideoRequest):
 
79
  cookie_button.click()
80
  logger.info("Accepted cookies")
81
  except TimeoutException:
82
+ logger.info("No cookie consent found")
83
  pass
84
 
85
  # Click more button
 
101
 
102
  # Extract transcript
103
  segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
104
+ transcript = []
105
+ for segment in segments:
106
+ try:
107
+ text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
108
+ if text:
109
+ transcript.append(text)
110
+ except:
111
+ continue
112
 
113
  if not transcript:
114
  raise HTTPException(status_code=404, detail="No transcript available")
 
121
  )
122
 
123
  except TimeoutException as e:
124
+ error_msg = "Timed out waiting for page elements - the video might not have transcripts"
125
+ logger.error(error_msg)
126
  return TranscriptResponse(
127
  success=False,
128
  transcript=None,
129
+ error=error_msg,
130
  processing_time=time.time() - start_time
131
  )
132
  except Exception as e: