hamza2923 committed on
Commit
dd49b8d
·
verified ·
1 Parent(s): abd2efb

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +189 -0
main.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from selenium import webdriver
5
+ from selenium.webdriver.chrome.service import Service
6
+ from selenium.webdriver.chrome.options import Options
7
+ from selenium.webdriver.common.by import By
8
+ from selenium.webdriver.support.ui import WebDriverWait
9
+ from selenium.webdriver.support import expected_conditions as EC
10
+ from selenium.common.exceptions import TimeoutException, WebDriverException
11
+ import time
12
+ import logging
13
+ import os
14
+ import shutil
15
+ from pathlib import Path
16
+
17
+
18
# Single application object; every route in this module registers on it.
app = FastAPI()

# CORS: any origin, method and header is accepted, and credentials are allowed.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive -- confirm this is intended before exposing the API publicly.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)

# Root logging at INFO, plus a module-level logger used by the handlers below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
32
+
33
class VideoRequest(BaseModel):
    """Request body for the transcript endpoint."""

    # Address of the video page whose transcript should be fetched.
    url: str
35
+
36
class TranscriptResponse(BaseModel):
    """Response body returned by the transcript endpoint."""

    # True when a transcript was extracted, False otherwise.
    success: bool
    # Transcript text segments in page order; None on failure.
    transcript: list[str] | None
    # Human-readable failure description; None on success.
    error: str | None
    # Wall-clock seconds spent handling the request.
    processing_time: float
41
+
42
def init_driver():
    """Create a headless Chrome WebDriver.

    A Chrome/Chromium binary is searched for in a fixed list of locations.
    ChromeDriver is expected at ``/usr/bin/chromedriver``; when it is not
    there, the ``chromedriver`` executable found on ``PATH`` (if any) is used
    instead.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Exception: if no Chrome binary or no ChromeDriver can be found, or if
            the driver fails to start. For start-up failures the original
            error is attached as ``__cause__``.
    """
    options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
    ):
        options.add_argument(flag)

    # Try multiple possible Chrome binary locations; first existing one wins.
    possible_chrome_paths = [
        "/usr/bin/google-chrome",
        "/usr/bin/google-chrome-stable",
        "/usr/lib/chromium-browser/chrome",
        "/usr/bin/chromium",
    ]
    chrome_path = next(
        (path for path in possible_chrome_paths if os.path.exists(path)),
        None,
    )
    if not chrome_path:
        logger.error(f"No Chrome binary found in paths: {possible_chrome_paths}")
        raise Exception(f"No Chrome binary found in paths: {possible_chrome_paths}")

    options.binary_location = chrome_path
    logger.info(f"Using Chrome binary: {chrome_path}")

    try:
        chromedriver_path = "/usr/bin/chromedriver"
        if not os.path.exists(chromedriver_path):
            # Fall back to PATH before giving up, so installs that place the
            # driver elsewhere (e.g. /usr/local/bin) still work.
            on_path = shutil.which("chromedriver")
            if on_path is None:
                raise Exception(f"ChromeDriver not found at {chromedriver_path}")
            chromedriver_path = on_path

        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)
    except Exception as e:
        # Logged once here; chaining keeps the original traceback available.
        logger.error(f"Driver initialization failed: {str(e)}")
        raise Exception(f"Driver initialization failed: {str(e)}") from e

    logger.info("ChromeDriver initialized successfully")
    return driver
82
+
83
def _is_youtube_url(url):
    """Return True when *url* is an http(s) URL hosted on YouTube.

    The host is parsed rather than substring-matched so that URLs which merely
    contain "youtube.com" somewhere (query string, path, other schemes) are
    rejected before the headless browser is pointed at them.
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    host = (parsed.hostname or "").lower()
    return host in ("youtube.com", "youtu.be") or host.endswith(
        (".youtube.com", ".youtu.be")
    )


def _scrape_transcript(video_url):
    """Open *video_url* in headless Chrome and return its transcript lines.

    Blocking helper: it drives Selenium synchronously and always quits the
    driver it created. Returns a (possibly empty) list of non-blank segment
    texts. ``TimeoutException`` propagates when an expected page element does
    not appear in time.
    """
    driver = init_driver()
    try:
        logger.info(f"Processing URL: {video_url}")
        driver.get(video_url)

        # Handle cookie consent if it appears; its absence is not an error.
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
            )
            cookie_button.click()
            logger.info("Accepted cookies")
        except TimeoutException:
            logger.info("No cookie consent found")

        # Expand the description ("more" button).
        more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "expand"))
        )
        driver.execute_script("arguments[0].click();", more_button)

        # Open the transcript panel.
        transcript_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
        )
        driver.execute_script("arguments[0].click();", transcript_button)

        # Wait for the transcript container to be present.
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "segments-container"))
        )

        # Extract the text of every segment, skipping ones that cannot be read.
        segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
        transcript = []
        for segment in segments:
            try:
                text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
            except WebDriverException:
                # Missing or stale segment element: best-effort, skip it.
                continue
            if text:
                transcript.append(text)
        return transcript
    finally:
        driver.quit()


@app.post("/transcript", response_model=TranscriptResponse)
async def get_transcript(request: VideoRequest):
    """Fetch the transcript of a YouTube video by scraping its watch page.

    Args:
        request: body carrying the video ``url``.

    Returns:
        ``TranscriptResponse`` with ``success=True`` and the transcript lines,
        or ``success=False`` and an ``error`` message when scraping fails
        (timeouts, driver start-up problems, other unexpected errors).

    Raises:
        HTTPException: 400 when the URL is not an http(s) YouTube URL, 404
            when the page yielded no transcript text. These are raised outside
            the generic error handler so they reach the client with their
            status codes instead of being folded into a 200 response.
    """
    import asyncio

    start_time = time.time()

    video_url = request.url
    if not _is_youtube_url(video_url):
        raise HTTPException(status_code=400, detail="Invalid YouTube URL")

    try:
        # Selenium is blocking; run it in a worker thread so the event loop
        # stays free to serve other requests.
        transcript = await asyncio.to_thread(_scrape_transcript, video_url)
    except TimeoutException:
        error_msg = "Timed out waiting for page elements - the video might not have transcripts"
        logger.error(error_msg)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=error_msg,
            processing_time=time.time() - start_time,
        )
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=str(e),
            processing_time=time.time() - start_time,
        )

    if not transcript:
        raise HTTPException(status_code=404, detail="No transcript available")

    return TranscriptResponse(
        success=True,
        transcript=transcript,
        error=None,
        processing_time=time.time() - start_time,
    )
166
+
167
+
168
+
169
# NOTE: `app` is created once near the top of this module. It must not be
# re-created here: a second FastAPI() would replace the instance that carries
# the CORS middleware and the /transcript route, leaving them unserved.


@app.get("/health")
def health_check():
    """Report whether Chrome and ChromeDriver can be found on ``PATH``.

    Returns:
        A dict with the resolved ``google-chrome`` and ``chromedriver`` paths
        (``None`` when not found) and a boolean per executable telling whether
        that path exists.
    """
    chrome_path = shutil.which("google-chrome")
    chromedriver_path = shutil.which("chromedriver")
    return {
        "ChromePath": chrome_path,
        "ChromeDriverPath": chromedriver_path,
        # Guard against None explicitly: Path("") is the current directory,
        # which always exists and would report a missing binary as present.
        "ChromeExists": chrome_path is not None and Path(chrome_path).exists(),
        "ChromeDriverExists": (
            chromedriver_path is not None and Path(chromedriver_path).exists()
        ),
    }
181
+
182
+
183
@app.get("/")
async def root():
    """Return the static welcome payload served at the API root."""
    greeting = {"message": "Welcome to YouTube Transcript API"}
    return greeting
186
+
187
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    # PORT comes from the environment when set; 7860 otherwise.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)