# keyword / main.py
# (HuggingFace Spaces page metadata, preserved as comments so the file parses)
# poemsforaphrodite's picture
# Update main.py
# d8b26a9 verified
import cohere
import requests
from bs4 import BeautifulSoup
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
# Initialize Cohere Client
# NOTE(review): if COHERE_API_KEY is unset, os.getenv returns None and the
# client is constructed with a None key — confirm this fails loudly at the
# first API call rather than silently.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
co = cohere.Client(COHERE_API_KEY)
def generate_embeddings(text_list, model_type):
    """Embed *text_list* with the Cohere model selected by *model_type*.

    Returns an empty list for empty input without calling the API;
    otherwise returns the list of embedding vectors from Cohere.
    """
    if not text_list:
        return []
    if model_type == 'English':
        model_name = 'embed-english-v3.0'
    else:
        model_name = 'embed-multilingual-v3.0'
    api_response = co.embed(
        model=model_name,
        input_type='classification',
        texts=text_list,
    )
    return api_response.embeddings
def fetch_content(url):
    """Download *url* and return its visible text content.

    Returns
    -------
    str
        The page text with tags stripped, or an empty string on any
        request failure so callers can filter out failed fetches.
    """
    try:
        # Fix: the original call had no timeout, so one unresponsive host
        # could hang the entire batch indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.get_text(separator=' ', strip=True)
        return content
    except requests.RequestException as e:
        # Best-effort: log and return empty rather than aborting the batch.
        print(f"Error fetching content from {url}: {e}")
        return ""
def get_embeddings_for_urls(urls, model_type):
    """Fetch every URL, drop failed fetches, and embed what remains.

    Returns a triple (valid_urls, embeddings, texts) whose elements are
    aligned by index — failed fetches (empty content) are excluded from all
    three lists.
    """
    fetched = [(url, fetch_content(url)) for url in urls]
    valid_urls = [url for url, text in fetched if text]
    texts = [text for _, text in fetched if text]
    embeddings = generate_embeddings(texts, model_type)
    return valid_urls, embeddings, texts
def calculate_relevance(content, content_embedding, keywords, model_type):
    """Score each keyword against one page embedding.

    Embeds the keywords, takes cosine similarity against the single page
    embedding, and returns a DataFrame with columns Keyword / RelevanceScore
    (similarity scaled to a percentage; negative scores are kept).
    """
    keyword_embeddings = generate_embeddings(keywords, model_type)
    similarity_row = cosine_similarity([content_embedding], keyword_embeddings)[0]
    return pd.DataFrame({
        'Keyword': keywords,
        'RelevanceScore': similarity_row * 100,  # Include negative scores
    })
def process_input(urls, keywords, keywords_file, model_type):
    """Gradio handler: score keyword relevance against each URL's content.

    Parameters
    ----------
    urls : str
        Newline-separated URLs from the textbox.
    keywords : str
        Comma-separated keywords; ignored when *keywords_file* is given.
    keywords_file : file-like or None
        Optional CSV/XLSX upload; keywords are read from the first column.
    model_type : str
        'English' or 'Multilingual' — selects the Cohere embedding model.

    Returns
    -------
    tuple
        (results DataFrame, relevance CSV path, scraped-content CSV path).

    Raises
    ------
    ValueError
        If an uploaded keywords file is neither .csv nor .xlsx.
    """
    url_list = [url.strip() for url in urls.split('\n') if url.strip()]  # Clean up and split the URLs

    if keywords_file is not None:
        # Detect the file extension and read the file accordingly
        if keywords_file.name.endswith('.csv'):
            keywords_df = pd.read_csv(keywords_file)
        elif keywords_file.name.endswith('.xlsx'):
            keywords_df = pd.read_excel(keywords_file)
        else:
            raise ValueError("Unsupported file format. Please upload a CSV or XLSX file.")
        keywords = keywords_df.iloc[:, 0].tolist()  # Assuming keywords are in the first column
    else:
        keywords = [keyword.strip() for keyword in keywords.split(',') if keyword.strip()]

    valid_urls, url_embeddings, url_contents = get_embeddings_for_urls(url_list, model_type)

    result_df_list = []
    # Calculate relevance for each URL content
    for content, embedding in zip(url_contents, url_embeddings):
        result_df = calculate_relevance(content, embedding, keywords, model_type)
        result_df['Content'] = content[:100] + "..."  # Adding a short preview of the content
        result_df_list.append(result_df)

    # Fix: pd.concat raises ValueError on an empty list (e.g. no URLs entered
    # or every fetch failed), which crashed the UI; return an empty frame
    # with the expected columns instead.
    if result_df_list:
        final_result_df = pd.concat(result_df_list).reset_index(drop=True)
    else:
        final_result_df = pd.DataFrame(columns=['Keyword', 'RelevanceScore', 'Content'])

    # Save the result to a CSV file
    final_result_df.to_csv("relevance_scores.csv", index=False)
    # Save the scraped content to a CSV file
    scraped_content_df = pd.DataFrame({'URL': valid_urls, 'Content': url_contents})
    scraped_content_df.to_csv("scraped_content.csv", index=False)
    return final_result_df, "relevance_scores.csv", "scraped_content.csv"
# Gradio UI wiring: four inputs (URL list, manual keywords, optional keyword
# file, model selector) mapped to process_input; outputs are the score table
# plus the two CSV files process_input writes.
interface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URLs (one per line)", lines=5, placeholder="https://example.com\nhttps://example.org"),
        gr.Textbox(label="Enter Keywords (comma-separated)", placeholder="keyword1, keyword2, keyword3"),
        gr.File(label="Upload Keywords File (keywords.csv or keywords.xlsx)"),
        gr.Radio(label="Select Model Type", choices=['English', 'Multilingual'], value='Multilingual')
    ],
    outputs=[
        gr.Dataframe(label="Relevance Scores"),
        gr.File(label="Download Relevance Scores CSV"),
        gr.File(label="Download Scraped Content CSV")
    ],
    title="Keyword Relevance to URLs",
    description="Enter URLs (one per line), enter keywords manually, or upload a 'keywords.csv' or 'keywords.xlsx' file to check their relevance."
)
if __name__ == "__main__":
    # Launch the Gradio interface.
    # share=True requests a public tunnel URL; show_error=True surfaces
    # handler exceptions in the UI instead of hiding them.
    interface.launch(share=True, show_error=True)