Spaces:
Sleeping
Sleeping
| import re | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import json | |
| import io | |
| import fitz | |
| from pptx import Presentation | |
| from io import BytesIO | |
| import chardet | |
| from docx import Document | |
| import pandas as pd | |
| from sumarize import summarize | |
| from io import BytesIO | |
| from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
| from pdfminer.converter import TextConverter | |
| from io import StringIO | |
| from pdfminer.layout import LAParams | |
| from pdfminer.pdfpage import PDFPage | |
def trim_input_words(input_str, max_new_tokens=512, max_total_tokens=32768):
    """Trim *input_str* (split on whitespace) to fit a model context window.

    Keeps at most ``max_total_tokens - max_new_tokens - 100`` words, reserving
    ``max_new_tokens`` for generation plus a 100-word safety margin.

    Args:
        input_str: Raw text to trim.
        max_new_tokens: Budget reserved for the model's generated output.
        max_total_tokens: Total context budget (input + output).

    Returns:
        The (possibly shortened) text, re-joined with single spaces.
    """
    words = input_str.split()
    # Bug fix: the original checked ``len(words) > limit - 100`` but sliced to
    # ``limit``, so the 100-word margin was tested yet never applied. Apply the
    # margin consistently in both the check and the slice.
    max_input_words = max_total_tokens - max_new_tokens - 100
    if len(words) > max_input_words:
        words = words[:max_input_words]
    return ' '.join(words)
def select_words_until_char_limit(s, char_limit):
    """Greedily keep leading words of *s* within *char_limit* characters.

    Punctuation is stripped first (whitespace preserved), then whole words are
    accumulated — each costing its length plus one joining space — until the
    next word would exceed the limit. The result is passed through
    ``trim_input_words`` as a final token-budget safeguard.
    """
    # Drop punctuation but keep whitespace so word boundaries survive.
    cleaned = re.sub(r'[^\w\s]', '', s)
    kept = []
    used = 0
    for token in cleaned.split():
        cost = len(token) + 1  # +1 for the space added by the join below
        if used + cost > char_limit:
            break
        kept.append(token)
        used += cost
    return trim_input_words(' '.join(kept))
def downl(url):
    """Scrape *url* and return the href of the last <li> anchor inside the
    last ``ul.dropdown-menu`` element.

    Returns "" on any failure (non-200 status, network error, missing
    elements) — callers treat an empty string as "no link found".
    """
    try:
        # Timeout added: requests.get with no timeout can block forever on a
        # stalled server.
        rq = requests.get(url, timeout=30)
        if rq.status_code != 200:
            return ""
        bs = BeautifulSoup(rq.text, features='lxml')
        lis = bs.find_all('ul', class_='dropdown-menu')[-1].find_all('li')
        link = lis[-1].find('a').get('href')
        print(link)
        return link
    except Exception:
        # Best-effort scraper: any failure degrades to "no link".
        return ""
def pdf(url):
    """Download the PDF at *url* and return its extracted text, capped at
    roughly 30000 characters via ``select_words_until_char_limit``.

    The pdfminer converter and the in-memory text buffer are now released in a
    ``finally`` block — the original leaked both if extraction raised.
    """
    # Timeout added: an unresponsive server must not hang the caller.
    response = requests.get(url, timeout=60)
    # Wrap the downloaded bytes so pdfminer can treat them as a file.
    pdf_file = BytesIO(response.content)
    resource_manager = PDFResourceManager()
    fake_file_handle = StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    try:
        for page in PDFPage.get_pages(pdf_file):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
        return select_words_until_char_limit(text, 30000)
    finally:
        # Always release pdfminer resources, even when a page fails to parse.
        converter.close()
        fake_file_handle.close()
def excel(link: str) -> str:
    """Download an Excel workbook and return up to 50 sampled rows rendered as
    a JSON-like string, trimmed to ~32000 characters.

    Returns an error-message string (never raises) on download/parse failure.
    """
    try:
        # Timeout added so a stalled download cannot hang the caller.
        response = requests.get(link, timeout=60)
        if response.status_code != 200:
            # Early return flattens the original nested success branch; error
            # message typo ("No dat avaible") fixed.
            print("Failed to download file")
            return "No data available error"
        df = pd.read_excel(BytesIO(response.content))
        # Cap at 50 rows; fixed seed keeps the sample reproducible.
        sample_df = df.sample(n=50, random_state=42) if df.shape[0] > 50 else df
        js = json.loads(sample_df.to_json(orient='records'))
        return select_words_until_char_limit(f"{js}", 32000)
    except Exception as e:
        print(e)
        return "No data available"
def csv(link: str) -> str:
    """Download a semicolon-separated CSV and return up to 50 sampled rows
    rendered as a JSON-like string, trimmed to ~32000 characters.

    Always returns a string: the original implicitly returned ``None`` when
    the HTTP status was not 200, breaking the declared ``-> str`` contract.
    """
    try:
        # Timeout added so a stalled download cannot hang the caller.
        response = requests.get(link, timeout=60)
        if response.status_code != 200:
            # Bug fix: the original fell through here and returned None.
            return 'No data available'
        file_content = response.content
        # Encoding is sniffed because sources vary; chardet picks the best guess.
        detected_encoding = chardet.detect(file_content)['encoding']
        # NOTE(review): separator is hard-coded to ';' — assumes European-style
        # CSV exports; confirm against the data sources actually used.
        df = pd.read_csv(io.BytesIO(file_content), encoding=detected_encoding, sep=';')
        if df.empty:
            print("The DataFrame is empty.")
            return 'The data frame is empty'
        # Cap at 50 rows; fixed seed keeps the sample reproducible.
        sample_df = df.sample(n=50, random_state=42) if df.shape[0] > 50 else df
        js = json.loads(sample_df.to_json(orient='records'))
        return select_words_until_char_limit(f"{js}", 32000)
    except Exception:
        return 'No data available'
def docx(url: str) -> str:
    """Download a .docx document from *url* and return its paragraph text,
    newline-joined and trimmed to ~32000 characters.

    Returns an error-message string (never raises) on download/parse failure.
    """
    try:
        # Timeout added so a stalled download cannot hang the caller.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # surface bad HTTP statuses as exceptions
        doc = Document(io.BytesIO(response.content))
        # Generator join replaces the original append loop.
        full_text = "\n".join(para.text for para in doc.paragraphs)
        return select_words_until_char_limit(full_text, 32000)
    except Exception as e:
        print(f"An error occurred: {e}")
        return 'No data available'
def pptx(url: str) -> str:
    """Download a .pptx presentation from *url* and return the text of every
    shape on every slide, newline-joined and trimmed to ~32000 characters.

    Returns an error-message string (never raises) on download/parse failure.
    """
    try:
        # Timeout added so a stalled download cannot hang the caller.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # surface bad HTTP statuses as exceptions
        presentation = Presentation(io.BytesIO(response.content))
        # Not every shape carries text (pictures, charts) — guard with hasattr.
        full_text = "\n".join(
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
        return select_words_until_char_limit(full_text, 32000)
    except Exception as e:
        print(f"An error occurred: {e}")
        return 'No data available'
def get_data(url):
    """Resolve a page *url* to its downloadable file, extract the file's
    content by extension, and return a summary produced by ``summarize``.

    Returns a plain message string for unsupported or unresolvable inputs.
    """
    # Strip agent-scratchpad artifacts from the URL. Bug fix: the quoted
    # variant must be removed FIRST — the original removed '\nObservation'
    # first, so '"\nObservation' could never match and a stray '"' survived.
    ki = url.replace('"\nObservation', '').replace('\nObservation', '')
    link = downl(ki)
    ext = link.split(".")[-1].lower()
    # Dispatch table replaces the original if/elif chain.
    handlers = {
        'xlsx': excel,
        'xls': excel,
        'xlsm': excel,
        'pdf': pdf,
        'docx': docx,
        'csv': csv,
        'pptx': pptx,
        'ppt': pptx,
    }
    handler = handlers.get(ext)
    if handler is not None:
        rs = handler(link)
        return summarize.invoke({"input": rs})
    if ext == 'doc':
        return "L'extension .doc non supportée."
    return "No data returned"