ocr_api2

Sleeping

ocr_api2 / main.py

Update main.py

fdf1d1f verified over 1 year ago

1.79 kB

	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from scraper import Scraper


	# Print the installed distributions at startup as a deployment diagnostic.
	# importlib.metadata is used instead of pip's private freeze module: pip
	# exposes no supported Python API, and importing it fails outright when pip
	# is not installed in the runtime image.
	from importlib import metadata as _metadata

	# One "name==version" entry per distribution, ordered case-insensitively by
	# name; distributions whose metadata has no Name field are skipped.
	pkgs = sorted(
	    (
	        f"{dist.metadata.get('Name')}=={dist.version}"
	        for dist in _metadata.distributions()
	        if dist.metadata.get('Name')
	    ),
	    key=str.lower,
	)
	for pkg in pkgs:
	    print(pkg)

	# ASGI application object; the route decorators in this module attach to it.
	app = FastAPI()

	# CORS policy: every origin, HTTP method and request header is accepted, and
	# credentialed requests are allowed.
	# NOTE(review): wildcard origins combined with allow_credentials=True is very
	# permissive -- confirm this is intended before exposing the API publicly.
	_CORS_SETTINGS = {
	    "allow_origins": ["*"],
	    "allow_credentials": True,
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_CORS_SETTINGS)

	# Upper bound, in seconds, on how long the outbound page fetch may take.
	_REQUEST_TIMEOUT_SECONDS = 30


	def _extract_text_and_links(html):
	    """Return ``(body_text, links)`` extracted from an HTML document.

	    ``body_text`` is the stripped text of every ``<p>`` and ``<h1>``-``<h6>``
	    element joined with newlines; ``links`` is the ``href`` value of every
	    ``<a>`` element that has one, in document order.
	    """
	    from bs4 import BeautifulSoup

	    soup = BeautifulSoup(html, 'html.parser')
	    elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
	    body_text = "\n".join(element.get_text().strip() for element in elements)
	    links = [a_tag['href'] for a_tag in soup.find_all('a', href=True)]
	    return body_text, links


	@app.get("/get_scraped_data")
	async def get_data(url: str):
	    """Fetch ``url``, print its text and links, and return ``"done"``.

	    Args:
	        url: Address of the page to scrape, taken from the query string.

	    Returns:
	        The literal string ``"done"``, whether or not the page answered
	        with HTTP 200.

	    Raises:
	        HTTPException: Status 500 when the HTTP request itself fails
	            (connection error, invalid URL, timeout).
	    """
	    import requests
	    from fastapi.concurrency import run_in_threadpool

	    # NOTE(review): ``url`` is caller-supplied and fetched from the server
	    # without validation, so this endpoint can be pointed at internal
	    # addresses (SSRF). Consider restricting the allowed hosts.
	    try:
	        # requests is blocking; run it in a worker thread so the event loop
	        # keeps serving other requests, and bound the wait with a timeout.
	        response = await run_in_threadpool(
	            requests.get, url, timeout=_REQUEST_TIMEOUT_SECONDS
	        )
	    except requests.RequestException as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    if response.status_code == 200:
	        body_text, links = _extract_text_and_links(response.content)

	        # Print the extracted information
	        print("Body Text:")
	        print(body_text)
	        print("\nLinks:")
	        for link in links:
	            print(link)
	    else:
	        print("Failed to retrieve the webpage")

	    # NOTE(review): an ``await Scraper.scrape(url)`` call used to follow this
	    # return and could never run; it has been dropped. Reinstate it in place
	    # of this return if the endpoint is meant to return the scraped data.
	    return "done"