| from fastapi import FastAPI, HTTPException |
| from fastapi.middleware.cors import CORSMiddleware |
| from scraper import Scraper |
|
|
|
|
# Locate pip's `freeze` module: try the `pip._internal` location first and
# fall back to `pip.operations` when that import is unavailable.
try:
    from pip._internal.operations import freeze
except ImportError:
    from pip.operations import freeze


# Import-time diagnostic: write every entry produced by `freeze.freeze()`
# to stdout, one per line.
pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)
|
|
# The ASGI application object; route decorators below attach to it.
app = FastAPI()

# Open CORS policy: any origin, any method, any header, credentials allowed.
# NOTE(review): the FastAPI CORS documentation discourages combining wildcard
# origins with allow_credentials=True -- confirm whether credentialed
# cross-origin requests are really required, or list explicit origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
|
|
def get_links(soup):
    """Return the ``href`` value of every ``<a>`` element found in *soup*.

    The result has one entry per anchor, in the order ``soup.find_all('a')``
    yields them. Each entry is whatever ``anchor.get('href')`` returns, so an
    anchor lacking the attribute is still represented in the list.
    """
    return [anchor.get('href') for anchor in soup.find_all('a')]
|
|
|
|
def get_text_content(soup):
    """Gather the text of paragraph, heading and span elements in *soup*.

    Tags are processed one kind at a time (``p``, then ``h1`` .. ``h6``, then
    ``span``), so the returned list is grouped by tag name rather than being
    in document order. Each entry is the ``get_text()`` of one element.
    """
    collected = []
    for tag_name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span'):
        collected.extend(node.get_text() for node in soup.find_all(tag_name))
    return collected
|
|
def get_title(soup):
    """Return the text of the document's ``<title>`` element.

    Args:
        soup: Parsed document exposing ``find(name)``.

    Returns:
        The title text, or an empty string when ``soup.find('title')`` finds
        no element (previously this case raised ``AttributeError``).
    """
    title_tag = soup.find('title')
    if title_tag is None:
        # Pages without a <title> are common (fragments, error pages);
        # report "no title" instead of crashing the caller.
        return ""
    return title_tag.get_text()
|
|
@app.get("/get_scraped_data")
async def get_data(url: str):
    """Scrape *url* and return its title, links and text content.

    The page is fetched with ``requests`` and parsed with BeautifulSoup; the
    ``Scraper`` helpers extract the title, links and text. When no links are
    found, the request is retried through ``Scraper.scrape(url)`` and its
    result is returned as-is.

    Args:
        url: Address of the page to scrape (query parameter).

    Returns:
        ``{"title": ..., "URL": ..., "Content": ...}`` from the primary path,
        or whatever ``Scraper.scrape`` returns on the fallback path.

    Raises:
        HTTPException: status 500 when the page cannot be fetched or the
            fallback scraper fails; ``detail`` carries the error text.
    """
    import requests
    from bs4 import BeautifulSoup
    from fastapi.concurrency import run_in_threadpool

    # NOTE(review): `url` is caller-supplied and fetched server-side without
    # validation -- consider restricting schemes/hosts to avoid SSRF.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # requests is blocking: run it in the threadpool so the event loop
        # keeps serving other requests, and bound the wait with a timeout.
        response = await run_in_threadpool(
            requests.get, url, headers=headers, timeout=10
        )
    except requests.RequestException as e:
        # Same error style as the fallback path below.
        raise HTTPException(status_code=500, detail=str(e)) from e

    soup = BeautifulSoup(response.content, 'html.parser')

    title = Scraper.get_title(soup)
    links = Scraper.get_links(soup)
    text_content = Scraper.get_text_content(soup)

    if links:
        return {"title": title, "URL": links, "Content": text_content}

    # No links found in the static HTML: hand over to the alternative scraper.
    print("Running alternative scrapper")
    try:
        return await Scraper.scrape(url)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|