| | from fastapi import FastAPI, HTTPException |
| | from fastapi.middleware.cors import CORSMiddleware |
| | from scraper import Scraper |
| |
|
| |
|
# Locate pip's "freeze" helper. It is tried at the private ``_internal`` path
# first, with the top-level ``pip.operations`` path as the fallback.
try:
    from pip._internal.operations import freeze
except ImportError:
    from pip.operations import freeze

# Startup diagnostic: write every installed distribution (one requirement
# line each) to stdout as soon as this module is imported.
pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)
| |
|
# ASGI application object; the route decorator further down registers on it.
app = FastAPI()

# CORS policy: any origin, any method and any header, with credentials enabled.
# NOTE(review): FastAPI's CORS documentation advises against pairing wildcard
# values with allow_credentials=True -- confirm whether explicit origins should
# be listed here before this is exposed publicly.
_cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
| |
|
@app.get("/get_scraped_data")
async def get_data(url: str):
    """Scrape the page at ``url`` and return the scraper's result.

    Args:
        url: Address of the page to scrape, supplied as a query parameter.

    Returns:
        Whatever awaiting ``Scraper.scrape(url)`` produces.

    Raises:
        HTTPException: Status 500, with the error text as ``detail``, when
            scraping fails for any reason.
    """
    # The inline requests/BeautifulSoup debugging snippet that used to sit
    # here has been removed: it only printed its findings, ended in an
    # unconditional ``return "done"`` that made the Scraper call unreachable,
    # and ran a blocking, timeout-less ``requests.get`` inside this coroutine,
    # which stalls the event loop for every other in-flight request.
    #
    # NOTE(review): ``url`` is caller-controlled, so this endpoint can be aimed
    # at internal hosts (SSRF). Confirm whether Scraper validates the target
    # or whether an allow-list is needed at this layer.
    try:
        return await Scraper.scrape(url)
    except Exception as e:
        # Surface the failure to the client as a 500 while keeping the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
| |
|
| |
|