ocr_api2

Sleeping

ocr_api2 / main.py

Update main.py

00a8453 verified over 1 year ago

1.74 kB

	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from scraper import Scraper


	# Startup diagnostic: print every installed distribution (pip-freeze
	# format) to stdout when this module is imported.
	try:
	    from pip._internal.operations import freeze
	except ImportError:
	    # pip < 10.0 kept the module at its old public location.
	    from pip.operations import freeze

	pkgs = freeze.freeze()
	for pkg in pkgs:
	    print(pkg)

	# ASGI application object; the route decorators in this module attach to it.
	app = FastAPI()

	# Fully permissive CORS: any origin, any method, any header.
	# NOTE(review): the CORS spec does not allow a wildcard origin on
	# credentialed requests -- confirm whether allow_credentials=True is
	# actually required by any client of this API.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	def get_links(soup):
	    """Return the ``href`` value of every ``<a>`` element found in ``soup``.

	    The result holds one entry per anchor, in the order ``find_all``
	    yields them; each entry is whatever ``anchor.get('href')`` returns,
	    so anchors without that attribute are not filtered out.
	    """
	    return [anchor.get('href') for anchor in soup.find_all('a')]


	def get_text_content(soup):
	    """Collect the text of paragraph, heading (h1-h6) and span elements.

	    Results are grouped by tag name in the fixed order below (all ``p``
	    texts first, then ``h1``, ... and finally ``span``), not in document
	    order.
	    """
	    wanted_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span')
	    collected = []
	    for tag_name in wanted_tags:
	        collected.extend(node.get_text() for node in soup.find_all(tag_name))
	    return collected

	def get_title(soup):
	    """Return the text of the document's ``<title>`` element.

	    Args:
	        soup: Parsed document exposing ``find`` (e.g. a BeautifulSoup
	            object).

	    Returns:
	        The title text, or an empty string when the document has no
	        ``<title>`` element.
	    """
	    title_tag = soup.find('title')
	    # find() gives None when nothing matches; calling get_text() on that
	    # would raise AttributeError for title-less pages.
	    if title_tag is None:
	        return ""
	    return title_tag.get_text()

	@app.get("/get_scraped_data")
	async def get_data(url: str):
	    """Scrape ``url`` and return its title, links and text content.

	    The page is fetched with ``requests`` and parsed with BeautifulSoup.
	    When that static parse yields no links, the request falls back to
	    ``Scraper.scrape``.

	    Args:
	        url: Address of the page to scrape.

	    Returns:
	        A dict with the keys ``"title"``, ``"URL"`` and ``"Content"`` when
	        the static parse found links; otherwise whatever
	        ``Scraper.scrape(url)`` returns.

	    Raises:
	        HTTPException: Status 500, carrying the error text, when the
	            fallback scraper raises.
	    """
	    import requests
	    from bs4 import BeautifulSoup

	    # NOTE(review): ``url`` is caller-supplied and fetched from the server
	    # without any allow-list, which exposes internal addresses (SSRF).
	    # NOTE(review): requests.get() is a blocking call inside an async
	    # handler; consider moving it to a worker thread.
	    headers = {'User-Agent': 'Mozilla/5.0'}
	    # A timeout keeps an unresponsive host from hanging the request forever.
	    response = requests.get(url, headers=headers, timeout=30)
	    soup = BeautifulSoup(response.content, 'html.parser')

	    title = Scraper.get_title(soup)
	    links = Scraper.get_links(soup)
	    text_content = Scraper.get_text_content(soup)

	    if links:
	        return {"title": title, "URL": links, "Content": text_content}

	    # No links in the static HTML: hand over to the alternative scraper.
	    print("Running alternative scrapper")
	    try:
	        return await Scraper.scrape(url)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e