import cohere
import requests
import os
from bs4 import BeautifulSoup
import gradio as gr
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Read the Cohere API key from the environment and create the client.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
co = cohere.Client(COHERE_API_KEY)


def generate_embeddings(text_list, model_type):
    """Embed a list of texts with the selected Cohere embedding model."""
    if not text_list:
        return []

    # Pick the English or multilingual v3 model; embed the texts as search documents.
    model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
    input_type = 'search_document'
    response = co.embed(model=model, texts=text_list, input_type=input_type)
    return response.embeddings


def fetch_content(url):
    """Fetch a URL and return its visible text; return an empty string on failure."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException:
        # Return an empty string so failed fetches are filtered out downstream
        # instead of being embedded as if they were page content.
        return ''


def get_embeddings_for_subpages(subpage_urls, model_type):
    # Fetch every subpage, drop the ones with no content, and keep the URL list
    # aligned with the contents that are actually embedded.
    contents = [fetch_content(subpage) for subpage in subpage_urls]
    non_empty_contents = [content for content in contents if content]
    subpage_urls = [subpage for subpage, content in zip(subpage_urls, contents) if content]
    embeddings = generate_embeddings(non_empty_contents, model_type)
    return subpage_urls, embeddings


def get_embedding_for_url(url, model_type):
    # Embed a single URL's content; return an empty list if the fetch failed.
    content = fetch_content(url)
    if content:
        embedding = generate_embeddings([content], model_type)
        return embedding[0]
    return []


def compute_relevancy(subpage_urls, specific_urls, model_type):
    # Embed the subpages once, then score each specific URL against all of them.
    subpages, subpage_embeddings = get_embeddings_for_subpages(subpage_urls, model_type)
    relevancy_results = []

    for specific_url in specific_urls:
        specific_embedding = get_embedding_for_url(specific_url, model_type)
        if not specific_embedding or not subpage_embeddings:
            continue

        # Cosine similarity between the specific URL and every subpage embedding.
        relevancy_scores = cosine_similarity([specific_embedding], subpage_embeddings).flatten()
        data = {'Specific URL': specific_url, 'Subpage URL': subpages, 'Relevancy Score': relevancy_scores}
        relevancy_results.append(pd.DataFrame(data))

    if relevancy_results:
        result_df = pd.concat(relevancy_results, ignore_index=True)
    else:
        result_df = pd.DataFrame(columns=['Specific URL', 'Subpage URL', 'Relevancy Score'])

    return result_df


def process_urls(subpage_urls_text, specific_urls_text, model_type, file=None):
    # Split the text boxes into URL lists, dropping blank lines.
    subpage_urls = [u.strip() for u in subpage_urls_text.splitlines() if u.strip()] if subpage_urls_text else []
    specific_urls = [u.strip() for u in specific_urls_text.splitlines() if u.strip()] if specific_urls_text else []

    # If a file was uploaded, read specific URLs from its first column.
    if file is not None:
        if file.name.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
            df = pd.read_excel(file.name)
        else:
            return pd.DataFrame(columns=['Specific URL', 'Subpage URL', 'Relevancy Score']), None

        specific_urls.extend(df.iloc[:, 0].dropna().tolist())

    result_df = compute_relevancy(subpage_urls, specific_urls, model_type)
    if result_df.empty:
        # Return both outputs so Gradio always gets a value for each component.
        return result_df, None

    result_df.to_csv('relevancy_scores.csv', index=False)
    return result_df, 'relevancy_scores.csv'


interface = gr.Interface(
    fn=process_urls,
    inputs=[
        gr.Textbox(label="Enter Subpage URLs (one per line)", lines=5),
        gr.Textbox(label="Enter Specific URLs (one per line, leave empty if uploading file)", lines=5),
        gr.Radio(['english', 'multilingual'], label="Select Model Type", value='english'),
        gr.File(label="Upload File with Specific URLs (CSV, XLSX, XLS)")
    ],
    outputs=[gr.Dataframe(label="Relevancy Scores"), gr.File(label="Download CSV")],
    title="URL Relevancy with Cohere",
    description="Enter subpage URLs (one per line) and either multiple specific URLs (one per line) or upload a file with specific URLs to compute relevancy scores."
)


if __name__ == "__main__":
    interface.launch()
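# To run (assuming this script is saved as app.py and the COHERE_API_KEY
# environment variable is set): python app.py
# Gradio then prints a local URL to open in a browser.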