import cohere
import requests
import os
from bs4 import BeautifulSoup
import gradio as gr
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Read the Cohere API key from the environment and create the client.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
co = cohere.Client(COHERE_API_KEY)


def generate_embeddings(text_list, model_type):
    """Embed a list of texts with the selected Cohere embedding model."""
    if not text_list:
        return []

    # Pick the English or multilingual v3 model; embed the texts as search documents.
    model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
    input_type = 'search_document'
    response = co.embed(model=model, texts=text_list, input_type=input_type)
    return response.embeddings


def fetch_content(url):
    """Fetch a URL and return its visible text; return an empty string on failure."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException:
        # Return an empty string so failed fetches are filtered out downstream
        # instead of being embedded as if they were page content.
        return ''


def get_embeddings_for_subpages(subpage_urls, model_type):
    # Fetch every subpage, drop the ones with no content, and keep the URL list
    # aligned with the contents that are actually embedded.
    contents = [fetch_content(subpage) for subpage in subpage_urls]
    non_empty_contents = [content for content in contents if content]
    subpage_urls = [subpage for subpage, content in zip(subpage_urls, contents) if content]
    embeddings = generate_embeddings(non_empty_contents, model_type)
    return subpage_urls, embeddings


def get_embedding_for_url(url, model_type):
    # Embed a single URL's content; return an empty list if the fetch failed.
    content = fetch_content(url)
    if content:
        embedding = generate_embeddings([content], model_type)
        return embedding[0]
    return []


def compute_relevancy(subpage_urls, specific_urls, model_type):
    # Embed the subpages once, then score each specific URL against all of them.
    subpages, subpage_embeddings = get_embeddings_for_subpages(subpage_urls, model_type)
    relevancy_results = []

    for specific_url in specific_urls:
        specific_embedding = get_embedding_for_url(specific_url, model_type)
        if not specific_embedding or not subpage_embeddings:
            continue

        # Cosine similarity between the specific URL and every subpage embedding.
        relevancy_scores = cosine_similarity([specific_embedding], subpage_embeddings).flatten()
        data = {'Specific URL': specific_url, 'Subpage URL': subpages, 'Relevancy Score': relevancy_scores}
        relevancy_results.append(pd.DataFrame(data))

    if relevancy_results:
        result_df = pd.concat(relevancy_results, ignore_index=True)
    else:
        result_df = pd.DataFrame(columns=['Specific URL', 'Subpage URL', 'Relevancy Score'])

    return result_df


def process_urls(subpage_urls_text, specific_urls_text, model_type, file=None):
    # Split the text boxes into URL lists, dropping blank lines.
    subpage_urls = [u.strip() for u in subpage_urls_text.splitlines() if u.strip()] if subpage_urls_text else []
    specific_urls = [u.strip() for u in specific_urls_text.splitlines() if u.strip()] if specific_urls_text else []

    # If a file was uploaded, read specific URLs from its first column.
    if file is not None:
        if file.name.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
            df = pd.read_excel(file.name)
        else:
            return pd.DataFrame(columns=['Specific URL', 'Subpage URL', 'Relevancy Score']), None

        specific_urls.extend(df.iloc[:, 0].dropna().tolist())

    result_df = compute_relevancy(subpage_urls, specific_urls, model_type)
    if result_df.empty:
        # Return both outputs so Gradio always gets a value for each component.
        return result_df, None

    result_df.to_csv('relevancy_scores.csv', index=False)
    return result_df, 'relevancy_scores.csv'


interface = gr.Interface(
    fn=process_urls,
    inputs=[
        gr.Textbox(label="Enter Subpage URLs (one per line)", lines=5),
        gr.Textbox(label="Enter Specific URLs (one per line, leave empty if uploading file)", lines=5),
        gr.Radio(['english', 'multilingual'], label="Select Model Type", value='english'),
        gr.File(label="Upload File with Specific URLs (CSV, XLSX, XLS)")
    ],
    outputs=[gr.Dataframe(label="Relevancy Scores"), gr.File(label="Download CSV")],
    title="URL Relevancy with Cohere",
    description="Enter subpage URLs (one per line) and either multiple specific URLs (one per line) or upload a file with specific URLs to compute relevancy scores."
)


if __name__ == "__main__":
    interface.launch()
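# To run (assuming this script is saved as app.py and the COHERE_API_KEY
# environment variable is set): python app.py
# Gradio then prints a local URL to open in a browser.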