hamza2923 committed on
Commit
2d8eb22
·
verified ·
1 Parent(s): c6a5ba1

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +186 -0
main.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from selenium import webdriver
5
+ from selenium.webdriver.chrome.service import Service
6
+ from selenium.webdriver.chrome.options import Options
7
+ from selenium.webdriver.common.by import By
8
+ from selenium.webdriver.support.ui import WebDriverWait
9
+ from selenium.webdriver.support import expected_conditions as EC
10
+ from selenium.common.exceptions import TimeoutException
11
+ import time
12
+ import logging
13
+ import os
14
+ import shutil
15
+ from pathlib import Path
16
+
17
app = FastAPI()

# Configure CORS
# The API is open to every origin. Credentials stay disabled: a wildcard
# origin must not be combined with credentialed requests (every site could
# then make authenticated calls on a user's behalf), and none of the
# endpoints visible in this file read cookies or authorization headers.
# To support credentials, list the trusted origins explicitly instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
31
+
32
+ # Pydantic models
33
class VideoRequest(BaseModel):
    """Request body carrying the URL of the video to process."""

    # Address of the video page, exactly as supplied by the client.
    url: str
35
+
36
class TranscriptResponse(BaseModel):
    """Response body describing the outcome of a transcript request."""

    # Success flag for the request.
    success: bool
    # Transcript lines as a list of strings, or None.
    transcript: list[str] | None
    # Error description as a string, or None.
    error: str | None
    # Processing duration as a float (presumably seconds -- confirm with caller).
    processing_time: float
41
+
42
+ # Driver init and route handlers here...
43
+
44
def init_driver():
    """Create a headless Chrome WebDriver.

    The Chrome binary is the first existing entry of a fixed list of install
    locations; if none exists, common executable names are looked up on
    ``PATH``. ChromeDriver is taken from ``/usr/bin/chromedriver`` and
    likewise falls back to a ``PATH`` lookup.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and
        must call ``quit()`` when done.

    Raises:
        Exception: If no Chrome binary is found, or if ChromeDriver is
            missing or fails to start. For driver failures the underlying
            error is chained as ``__cause__``.
    """
    options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
    ):
        options.add_argument(flag)

    # Try multiple possible Chrome binary locations
    possible_chrome_paths = [
        "/usr/bin/google-chrome",
        "/usr/bin/google-chrome-stable",
        "/usr/lib/chromium-browser/chrome",
        "/usr/bin/chromium",
    ]
    chrome_path = next(
        (path for path in possible_chrome_paths if os.path.exists(path)),
        None,
    )
    if chrome_path is None:
        # Fall back to PATH so installs outside the fixed locations still work.
        for name in (
            "google-chrome",
            "google-chrome-stable",
            "chromium",
            "chromium-browser",
        ):
            chrome_path = shutil.which(name)
            if chrome_path:
                break

    if not chrome_path:
        logger.error(f"No Chrome binary found in paths: {possible_chrome_paths}")
        raise Exception(f"No Chrome binary found in paths: {possible_chrome_paths}")

    options.binary_location = chrome_path
    logger.info(f"Using Chrome binary: {chrome_path}")

    try:
        chromedriver_path = "/usr/bin/chromedriver"
        if not os.path.exists(chromedriver_path):
            # Same PATH fallback as for the browser binary.
            driver_on_path = shutil.which("chromedriver")
            if driver_on_path is None:
                logger.error(f"ChromeDriver not found at {chromedriver_path}")
                raise Exception(f"ChromeDriver not found at {chromedriver_path}")
            chromedriver_path = driver_on_path

        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)
        logger.info("ChromeDriver initialized successfully")
        return driver
    except Exception as e:
        logger.error(f"Driver initialization failed: {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Driver initialization failed: {str(e)}") from e
84
+
85
def _is_youtube_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL whose host is youtube.com or youtu.be."""
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url.strip())
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 literals).
        return False

    if parsed.scheme not in ("http", "https"):
        return False
    # Compare the parsed host, not a substring of the whole URL, so that
    # e.g. "http://example.org/?x=youtube.com" is rejected.
    return host in ("youtube.com", "youtu.be") or host.endswith(".youtube.com")


def _scrape_transcript(video_url: str) -> list[str]:
    """Load *video_url* in headless Chrome and return its transcript lines.

    Blocking: drives a real browser and waits on page elements. Raises
    ``TimeoutException`` when an expected element never appears, and
    whatever ``init_driver`` raises when the browser cannot be started.
    """
    driver = init_driver()
    try:
        logger.info(f"Processing URL: {video_url}")
        driver.get(video_url)

        # Handle cookie consent if it appears
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
            )
            cookie_button.click()
            logger.info("Accepted cookies")
        except TimeoutException:
            logger.info("No cookie consent found")

        # Click more button
        more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "expand"))
        )
        driver.execute_script("arguments[0].click();", more_button)

        # Click transcript button
        transcript_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
        )
        driver.execute_script("arguments[0].click();", transcript_button)

        # Wait for transcript
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "segments-container"))
        )

        # Extract transcript
        segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
        transcript = []
        for segment in segments:
            try:
                text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
            except Exception:
                # Best effort: skip segments without readable text, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except would.
                continue
            if text:
                transcript.append(text)
        return transcript
    finally:
        # Always release the browser process, even on failure.
        driver.quit()


@app.post("/transcript", response_model=TranscriptResponse)
async def get_transcript(request: VideoRequest):
    """Return the transcript of the YouTube video named in the request.

    Args:
        request: Body holding the video URL.

    Returns:
        A ``TranscriptResponse``. ``success`` is True with the transcript
        lines on success; on scraping failures ``success`` is False and
        ``error`` carries the message.

    Raises:
        HTTPException: 400 if the URL is not a YouTube URL, 404 if the page
            yielded no transcript text.
    """
    import asyncio

    start_time = time.time()
    video_url = request.url

    # Validate outside the try block so the 400 reaches the client instead of
    # being caught by the generic handler below.
    if not _is_youtube_url(video_url):
        raise HTTPException(status_code=400, detail="Invalid YouTube URL")

    try:
        # Selenium is blocking; run it in a worker thread so the event loop
        # keeps serving other requests meanwhile.
        transcript = await asyncio.to_thread(_scrape_transcript, video_url)
    except TimeoutException:
        error_msg = "Timed out waiting for page elements - the video might not have transcripts"
        logger.error(error_msg)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=error_msg,
            processing_time=time.time() - start_time,
        )
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=str(e),
            processing_time=time.time() - start_time,
        )

    if not transcript:
        raise HTTPException(status_code=404, detail="No transcript available")

    return TranscriptResponse(
        success=True,
        transcript=transcript,
        error=None,
        processing_time=time.time() - start_time,
    )
168
+
169
@app.get("/health")
def health_check():
    """Report whether Chrome and ChromeDriver can be found on ``PATH``.

    Returns:
        A dict with the resolved paths (``ChromePath``, ``ChromeDriverPath``;
        None when not found) and booleans (``ChromeExists``,
        ``ChromeDriverExists``) telling whether each resolved path exists.
    """
    chrome_path = shutil.which("google-chrome")
    chromedriver_path = shutil.which("chromedriver")
    # Guard against None explicitly: Path("") denotes the current directory,
    # which always exists, so `Path(path or "").exists()` would report True
    # for a missing binary.
    return {
        "ChromePath": chrome_path,
        "ChromeDriverPath": chromedriver_path,
        "ChromeExists": chrome_path is not None and Path(chrome_path).exists(),
        "ChromeDriverExists": (
            chromedriver_path is not None and Path(chromedriver_path).exists()
        ),
    }
179
+
180
@app.get("/")
async def root():
    """Serve the landing message at the API root."""
    greeting = {"message": "Welcome to YouTube Transcript API"}
    return greeting
183
+
184
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, using the port
    # from the PORT environment variable or 7860 when it is unset.
    import uvicorn

    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)