Spaces:
Build error
Build error
File size: 6,075 Bytes
f59b84b cb031f8 f59b84b cb031f8 f59b84b cb031f8 f59b84b 377c751 d47d003 f59b84b 165b09c f59b84b 165b09c f59b84b cb031f8 f59b84b cb031f8 dd1737f cb031f8 dd1737f 98664d2 cb031f8 7c6deed 98664d2 5ded0bd f59b84b d47d003 f59b84b cb031f8 f59b84b cb031f8 f59b84b cb031f8 f59b84b cb031f8 f59b84b d47d003 f59b84b 377c751 d0bef6c 377c751 d0bef6c 377c751 d0bef6c 5ded0bd 377c751 5ded0bd f59b84b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import time
import logging
import os
import shutil
from pathlib import Path
# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# The FastAPI application instance.
app = FastAPI()

# Configure CORS: any origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# very permissive policy -- confirm that credentialed cross-origin requests
# are really required before keeping it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
class VideoRequest(BaseModel):
    """Request body carrying the URL of the video to process."""

    # Address of the video page, as supplied by the client.
    url: str
class TranscriptResponse(BaseModel):
    """Response body describing the outcome of a transcript request."""

    # Whether a transcript was produced.
    success: bool
    # Transcript text segments, or None when none were produced.
    transcript: list[str] | None
    # Error message, or None when the request succeeded.
    error: str | None
    # Elapsed handling time, in seconds.
    processing_time: float
def init_driver():
    """Create a headless Chrome WebDriver session.

    The Chrome binary is the first existing entry in a fixed list of install
    locations; ChromeDriver is expected at ``/usr/bin/chromedriver``.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when finished.

    Raises:
        RuntimeError: If no Chrome binary or ChromeDriver executable is found,
            or if the browser session cannot be started. In the latter case
            the underlying error is chained as ``__cause__``.
    """
    options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
    ):
        options.add_argument(flag)

    # Try multiple possible Chrome binary locations; the first hit wins.
    possible_chrome_paths = [
        "/usr/bin/google-chrome",
        "/usr/bin/google-chrome-stable",
        "/usr/lib/chromium-browser/chrome",
        "/usr/bin/chromium",
    ]
    chrome_path = next(
        (path for path in possible_chrome_paths if os.path.exists(path)),
        None,
    )
    if chrome_path is None:
        message = f"No Chrome binary found in paths: {possible_chrome_paths}"
        logger.error(message)
        raise RuntimeError(message)
    options.binary_location = chrome_path
    logger.info("Using Chrome binary: %s", chrome_path)

    # Checked outside the try below so this error is reported once, as-is,
    # instead of being caught and re-wrapped by our own handler.
    chromedriver_path = "/usr/bin/chromedriver"
    if not os.path.exists(chromedriver_path):
        message = f"ChromeDriver not found at {chromedriver_path}"
        logger.error(message)
        raise RuntimeError(message)

    try:
        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)
    except Exception as exc:
        logger.error("Driver initialization failed: %s", exc)
        # Chain the original error so its traceback is not lost.
        raise RuntimeError(f"Driver initialization failed: {exc}") from exc

    logger.info("ChromeDriver initialized successfully")
    return driver
def _is_youtube_url(url: str) -> bool:
    """Return True when *url* is an http(s) URL whose host is a YouTube domain."""
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 brackets).
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Compare the parsed host, not the raw string, so a URL that merely
    # mentions "youtube.com" in its path or query is not accepted.
    return host in ("youtube.com", "youtu.be") or host.endswith(
        (".youtube.com", ".youtu.be")
    )


def _scrape_transcript(video_url: str) -> list[str]:
    """Open *video_url* in a fresh browser and return its transcript segments.

    Blocking helper: it drives Selenium synchronously and always quits the
    browser it started.

    Raises:
        HTTPException: 404 when the transcript panel contains no text.
        TimeoutException: When an expected page element does not appear.
    """
    driver = init_driver()
    try:
        logger.info("Processing URL: %s", video_url)
        driver.get(video_url)

        # Handle cookie consent if it appears
        try:
            consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//*[contains(text(), 'Accept all')]")
                )
            )
        except TimeoutException:
            logger.info("No cookie consent found")
        else:
            consent_button.click()
            logger.info("Accepted cookies")

        # Expand the description ("more"), then open the transcript panel.
        # Clicks go through JavaScript, as in the original implementation.
        for locator in (
            (By.ID, "expand"),
            (By.CSS_SELECTOR, "button[aria-label='Show transcript']"),
        ):
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(locator)
            )
            driver.execute_script("arguments[0].click();", button)

        # Wait for transcript
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "segments-container"))
        )

        # Extract transcript
        transcript = []
        for segment in driver.find_elements(
            By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer"
        ):
            try:
                text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
            except WebDriverException:
                # Skip segments whose text node is missing or went stale;
                # anything that is not a Selenium error still propagates.
                continue
            if text:
                transcript.append(text)

        if not transcript:
            raise HTTPException(status_code=404, detail="No transcript available")
        return transcript
    finally:
        driver.quit()


@app.post("/transcript", response_model=TranscriptResponse)
async def get_transcript(request: VideoRequest):
    """Scrape the transcript of the YouTube video named in *request*.

    Returns:
        ``TranscriptResponse`` with ``success=True`` and the segment texts, or
        ``success=False`` and an error message when scraping fails or times
        out.

    Raises:
        HTTPException: 400 when the URL is not a YouTube URL, 404 when the
            video exposes no transcript text.
    """
    import asyncio

    start_time = time.perf_counter()
    video_url = request.url
    if not _is_youtube_url(video_url):
        raise HTTPException(status_code=400, detail="Invalid YouTube URL")

    try:
        # Selenium is blocking; run it in a worker thread so the event loop
        # keeps serving other requests while the browser works.
        transcript = await asyncio.to_thread(_scrape_transcript, video_url)
    except HTTPException:
        # Let FastAPI turn these into real HTTP error responses instead of
        # folding them into a 200 body via the generic handler below.
        raise
    except TimeoutException:
        error_msg = "Timed out waiting for page elements - the video might not have transcripts"
        logger.error(error_msg)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=error_msg,
            processing_time=time.perf_counter() - start_time,
        )
    except Exception as exc:
        logger.exception("Error: %s", exc)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=str(exc),
            processing_time=time.perf_counter() - start_time,
        )

    return TranscriptResponse(
        success=True,
        transcript=transcript,
        error=None,
        processing_time=time.perf_counter() - start_time,
    )
# NOTE: `app` is created once near the top of this module. Re-creating it here
# would rebind the name to a fresh instance and silently drop the CORS
# middleware and the /transcript route registered on the first one.
@app.get("/health")
def health_check():
    """Report whether Chrome and ChromeDriver can be found on PATH.

    Returns:
        A dict with the resolved paths of ``google-chrome`` and
        ``chromedriver`` (``None`` when not found) and a boolean for each
        telling whether that path exists.
    """
    chrome_path = shutil.which("google-chrome")
    chromedriver_path = shutil.which("chromedriver")
    return {
        "ChromePath": chrome_path,
        "ChromeDriverPath": chromedriver_path,
        # Path("") is the current directory, which always exists, so a
        # missing executable has to be reported as False explicitly.
        "ChromeExists": chrome_path is not None and Path(chrome_path).exists(),
        "ChromeDriverExists": (
            chromedriver_path is not None and Path(chromedriver_path).exists()
        ),
    }
@app.get("/")
async def root():
    """Return the static welcome message for the API root."""
    welcome = {"message": "Welcome to YouTube Transcript API"}
    return welcome
if __name__ == "__main__":
    # Serve the API directly with uvicorn when run as a script. The port is
    # read from the PORT environment variable; an unset or empty value falls
    # back to 7860.
    import uvicorn

    port = int(os.getenv("PORT") or "7860")
    uvicorn.run(app, host="0.0.0.0", port=port)