# keyword_specific / main.py
# Uploaded by poemsforaphrodite via huggingface_hub (commit cb40bf9, verified)
import cohere
import requests
import os
from bs4 import BeautifulSoup
import gradio as gr
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Initialize Cohere client from the environment.
# NOTE(review): if COHERE_API_KEY is unset, os.getenv returns None and the
# client is constructed with it anyway — embedding calls will then fail at
# request time rather than here; verify this is the intended failure mode.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
co = cohere.Client(COHERE_API_KEY)
def generate_embeddings(text_list, model_type):
    """Embed a list of texts with Cohere.

    Args:
        text_list: list of plain-text documents; an empty list short-circuits
            to [] without calling the API.
        model_type: 'english' selects the English v3 model; anything else
            selects the multilingual v3 model.

    Returns:
        A list of embedding vectors, one per input text.
    """
    if not text_list:
        return []
    if model_type == 'english':
        chosen_model = 'embed-english-v3.0'
    else:
        chosen_model = 'embed-multilingual-v3.0'
    # 'search_document' tells Cohere these are plain documents to be indexed.
    result = co.embed(
        model=chosen_model,
        texts=text_list,
        input_type='search_document',
    )
    return result.embeddings
def fetch_content(url):
    """Download *url* and return its visible text, or '' on any request failure.

    Returning a falsy value on failure is deliberate: every caller filters
    pages with ``if content``, and the previous behavior of returning
    ``str(e)`` made HTTP error messages look like real page text, so they
    were silently embedded and scored.

    Args:
        url: the page to fetch.

    Returns:
        The page's text content (tags stripped, whitespace collapsed by
        BeautifulSoup), or the empty string when the request fails.
    """
    try:
        # Timeout keeps one slow/unreachable host from hanging the whole run.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException:
        return ''
def get_embeddings_for_subpages(subpage_urls, model_type):
    """Fetch each subpage, drop the ones with empty content, embed the rest.

    Args:
        subpage_urls: URLs to fetch and embed.
        model_type: passed through to generate_embeddings.

    Returns:
        A (urls, embeddings) pair of equal-length, position-aligned lists
        containing only the pages whose fetched content was non-empty.
    """
    fetched = [(url, fetch_content(url)) for url in subpage_urls]
    kept_urls = [url for url, text in fetched if text]
    kept_texts = [text for _, text in fetched if text]
    return kept_urls, generate_embeddings(kept_texts, model_type)
def get_embedding_for_url(url, model_type):
    """Embed a single page's text content.

    Args:
        url: the page to fetch and embed.
        model_type: passed through to generate_embeddings.

    Returns:
        One embedding vector, or [] when the page yields no content.
    """
    text = fetch_content(url)
    if not text:
        return []
    return generate_embeddings([text], model_type)[0]
def compute_relevancy(subpage_urls, specific_urls, model_type):
    """Score every subpage against each specific URL with cosine similarity.

    Args:
        subpage_urls: candidate pages to score.
        specific_urls: reference pages to score against.
        model_type: embedding model selector, passed through.

    Returns:
        A DataFrame with columns ['Specific URL', 'Subpage URL',
        'Relevancy Score']; empty when no embeddings could be produced on
        either side.
    """
    subpages, subpage_embeddings = get_embeddings_for_subpages(subpage_urls, model_type)
    frames = []
    for target in specific_urls:
        target_embedding = get_embedding_for_url(target, model_type)
        # Skip targets that produced no embedding, or when no subpage did.
        if not (target_embedding and subpage_embeddings):
            continue
        scores = cosine_similarity([target_embedding], subpage_embeddings).flatten()
        frames.append(pd.DataFrame({
            'Specific URL': target,
            'Subpage URL': subpages,
            'Relevancy Score': scores,
        }))
    if not frames:
        return pd.DataFrame(columns=['Specific URL', 'Subpage URL', 'Relevancy Score'])
    return pd.concat(frames)
def process_urls(subpage_urls_text, specific_urls_text, model_type, file=None):
    """Gradio handler: collect URLs, compute relevancy, and return results.

    Args:
        subpage_urls_text: newline-separated subpage URLs from the textbox.
        specific_urls_text: newline-separated specific URLs from the textbox.
        model_type: 'english' or 'multilingual'.
        file: optional uploaded CSV/XLSX/XLS whose first column holds
            additional specific URLs.

    Returns:
        A (DataFrame, csv_path_or_None) pair. Always a 2-tuple: the
        interface declares two outputs, and the previous version fell off
        the end (returning a bare None) when there were no results, which
        broke the Gradio output mapping.
    """
    # Split on newlines and drop blanks/whitespace so stray empty lines in
    # the textboxes are not fetched as URLs.
    subpage_urls = [u.strip() for u in (subpage_urls_text or '').split('\n') if u.strip()]
    specific_urls = [u.strip() for u in (specific_urls_text or '').split('\n') if u.strip()]
    if file is not None:
        if file.name.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif file.name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file.name)
        else:
            # Unsupported upload type: return an empty result instead of crashing.
            return pd.DataFrame(columns=['Specific URL', 'Subpage URL', 'Relevancy Score']), None
        # First column of the upload is assumed to hold the specific URLs;
        # drop NaN cells so empty spreadsheet rows don't become URLs.
        specific_urls.extend(str(u).strip() for u in df.iloc[:, 0].dropna().tolist())
    result_df = compute_relevancy(subpage_urls, specific_urls, model_type)
    if result_df.empty:
        return result_df, None
    result_df.to_csv('relevancy_scores.csv', index=False)
    return result_df, 'relevancy_scores.csv'
# Gradio UI wiring: the four inputs map positionally onto process_urls'
# parameters, and the two outputs receive the (DataFrame, csv-path) pair
# it returns.
interface = gr.Interface(
    fn=process_urls,
    inputs=[
        gr.Textbox(label="Enter Subpage URLs (one per line)", lines=5),
        gr.Textbox(label="Enter Specific URLs (one per line, leave empty if uploading file)", lines=5),
        gr.Radio(['english', 'multilingual'], label="Select Model Type"),
        gr.File(label="Upload File with Specific URLs (CSV, XLSX, XLS)")
    ],
    outputs=[gr.Dataframe(label="Relevancy Scores"), gr.File(label="Download CSV")],
    title="URL Relevancy with Cohere",
    description="Enter subpage URLs (one per line) and either multiple specific URLs (one per line) or upload a file with specific URLs to compute relevancy scores."
)
# Launch the web app only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()