# keyword / main.py
# (HuggingFace Spaces page metadata, preserved as comments so the file parses)
# poemsforaphrodite's picture
# Update main.py
# d8b26a9 verified
import cohere
import requests
from bs4 import BeautifulSoup
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
# Initialize Cohere Client
# NOTE(review): if COHERE_API_KEY is unset, os.getenv returns None and the
# client is constructed with a None key — confirm this fails loudly at the
# first API call rather than silently.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
co = cohere.Client(COHERE_API_KEY)
def generate_embeddings(text_list, model_type):
    """Embed *text_list* with the Cohere model selected by *model_type*.

    Returns an empty list for empty input without calling the API;
    otherwise returns the list of embedding vectors from Cohere.
    """
    if not text_list:
        return []
    if model_type == 'English':
        model_name = 'embed-english-v3.0'
    else:
        model_name = 'embed-multilingual-v3.0'
    api_response = co.embed(
        model=model_name,
        input_type='classification',
        texts=text_list,
    )
    return api_response.embeddings
def fetch_content(url):
    """Download *url* and return its visible text content.

    Returns
    -------
    str
        The page text with tags stripped, or an empty string on any
        request failure so callers can filter out failed fetches.
    """
    try:
        # Fix: the original call had no timeout, so one unresponsive host
        # could hang the entire batch indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.get_text(separator=' ', strip=True)
        return content
    except requests.RequestException as e:
        # Best-effort: log and return empty rather than aborting the batch.
        print(f"Error fetching content from {url}: {e}")
        return ""
def get_embeddings_for_urls(urls, model_type):
    """Fetch every URL, drop failed fetches, and embed what remains.

    Returns a triple (valid_urls, embeddings, texts) whose elements are
    aligned by index — failed fetches (empty content) are excluded from all
    three lists.
    """
    fetched = [(url, fetch_content(url)) for url in urls]
    valid_urls = [url for url, text in fetched if text]
    texts = [text for _, text in fetched if text]
    embeddings = generate_embeddings(texts, model_type)
    return valid_urls, embeddings, texts
def calculate_relevance(content, content_embedding, keywords, model_type):
    """Score each keyword against one page embedding.

    Embeds the keywords, takes cosine similarity against the single page
    embedding, and returns a DataFrame with columns Keyword / RelevanceScore
    (similarity scaled to a percentage; negative scores are kept).
    """
    keyword_embeddings = generate_embeddings(keywords, model_type)
    similarity_row = cosine_similarity([content_embedding], keyword_embeddings)[0]
    return pd.DataFrame({
        'Keyword': keywords,
        'RelevanceScore': similarity_row * 100,  # Include negative scores
    })
def process_input(urls, keywords, keywords_file, model_type):
    """Gradio handler: score keyword relevance against each URL's content.

    Parameters
    ----------
    urls : str
        Newline-separated URLs from the textbox.
    keywords : str
        Comma-separated keywords; ignored when *keywords_file* is given.
    keywords_file : file-like or None
        Optional CSV/XLSX upload; keywords are read from the first column.
    model_type : str
        'English' or 'Multilingual' — selects the Cohere embedding model.

    Returns
    -------
    tuple
        (results DataFrame, relevance CSV path, scraped-content CSV path).

    Raises
    ------
    ValueError
        If an uploaded keywords file is neither .csv nor .xlsx.
    """
    url_list = [url.strip() for url in urls.split('\n') if url.strip()]  # Clean up and split the URLs

    if keywords_file is not None:
        # Detect the file extension and read the file accordingly
        if keywords_file.name.endswith('.csv'):
            keywords_df = pd.read_csv(keywords_file)
        elif keywords_file.name.endswith('.xlsx'):
            keywords_df = pd.read_excel(keywords_file)
        else:
            raise ValueError("Unsupported file format. Please upload a CSV or XLSX file.")
        keywords = keywords_df.iloc[:, 0].tolist()  # Assuming keywords are in the first column
    else:
        keywords = [keyword.strip() for keyword in keywords.split(',') if keyword.strip()]

    valid_urls, url_embeddings, url_contents = get_embeddings_for_urls(url_list, model_type)

    result_df_list = []
    # Calculate relevance for each URL content
    for content, embedding in zip(url_contents, url_embeddings):
        result_df = calculate_relevance(content, embedding, keywords, model_type)
        result_df['Content'] = content[:100] + "..."  # Adding a short preview of the content
        result_df_list.append(result_df)

    # Fix: pd.concat raises ValueError on an empty list (e.g. no URLs entered
    # or every fetch failed), which crashed the UI; return an empty frame
    # with the expected columns instead.
    if result_df_list:
        final_result_df = pd.concat(result_df_list).reset_index(drop=True)
    else:
        final_result_df = pd.DataFrame(columns=['Keyword', 'RelevanceScore', 'Content'])

    # Save the result to a CSV file
    final_result_df.to_csv("relevance_scores.csv", index=False)
    # Save the scraped content to a CSV file
    scraped_content_df = pd.DataFrame({'URL': valid_urls, 'Content': url_contents})
    scraped_content_df.to_csv("scraped_content.csv", index=False)
    return final_result_df, "relevance_scores.csv", "scraped_content.csv"
# Gradio UI wiring: four inputs (URL list, manual keywords, optional keyword
# file, model selector) mapped to process_input; outputs are the score table
# plus the two CSV files process_input writes.
interface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URLs (one per line)", lines=5, placeholder="https://example.com\nhttps://example.org"),
        gr.Textbox(label="Enter Keywords (comma-separated)", placeholder="keyword1, keyword2, keyword3"),
        gr.File(label="Upload Keywords File (keywords.csv or keywords.xlsx)"),
        gr.Radio(label="Select Model Type", choices=['English', 'Multilingual'], value='Multilingual')
    ],
    outputs=[
        gr.Dataframe(label="Relevance Scores"),
        gr.File(label="Download Relevance Scores CSV"),
        gr.File(label="Download Scraped Content CSV")
    ],
    title="Keyword Relevance to URLs",
    description="Enter URLs (one per line), enter keywords manually, or upload a 'keywords.csv' or 'keywords.xlsx' file to check their relevance."
)
if __name__ == "__main__":
    # Launch the Gradio interface.
    # share=True requests a public tunnel URL; show_error=True surfaces
    # handler exceptions in the UI instead of hiding them.
    interface.launch(share=True, show_error=True)