# NOTE: Extraction residue from the HuggingFace Spaces page ("Spaces: Build error")
# removed; the Space's build log reported a build error at capture time.
| import cohere | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import os | |
# Initialize Cohere Client
# The API key is read from the environment; os.getenv returns None when
# COHERE_API_KEY is unset, in which case embed calls will fail at request time.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
co = cohere.Client(COHERE_API_KEY)
def generate_embeddings(text_list, model_type):
    """Embed a list of texts with the Cohere API.

    Args:
        text_list: Texts to embed; an empty list short-circuits to [].
        model_type: 'English' selects embed-english-v3.0; any other value
            falls back to the multilingual v3 model.

    Returns:
        A list of embedding vectors, one per input text.
    """
    # Guard clause: nothing to embed, avoid an empty API call.
    if not text_list:
        return []
    model_name = (
        'embed-english-v3.0'
        if model_type == 'English'
        else 'embed-multilingual-v3.0'
    )
    # NOTE(review): input_type='classification' is kept from the original;
    # confirm it is intended for a similarity/search workload.
    response = co.embed(
        model=model_name,
        input_type='classification',
        texts=text_list,
    )
    return response.embeddings
def fetch_content(url, timeout=10):
    """Download a URL and return its visible text.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server. New parameter with a
            default, so existing callers are unaffected.

    Returns:
        The page text with tags stripped, or "" on any request error.
    """
    try:
        # Fix: the original requests.get had no timeout, so a single
        # unresponsive URL could hang the whole batch indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        # Best-effort: report and return "" so the caller can skip this
        # URL instead of crashing the batch.
        print(f"Error fetching content from {url}: {e}")
        return ""
def get_embeddings_for_urls(urls, model_type):
    """Fetch every URL and embed the pages that yielded non-empty text.

    Args:
        urls: URLs to scrape.
        model_type: Forwarded to generate_embeddings.

    Returns:
        (valid_urls, embeddings, contents) — three parallel lists covering
        only the URLs whose fetch produced text.
    """
    valid_urls = []
    page_texts = []
    # Single pass: keep the URL and its text together instead of
    # filtering two separate lists afterwards.
    for url in urls:
        text = fetch_content(url)
        if text:
            valid_urls.append(url)
            page_texts.append(text)
    embeddings = generate_embeddings(page_texts, model_type)
    return valid_urls, embeddings, page_texts
def calculate_relevance(content, content_embedding, keywords, model_type):
    """Score each keyword's cosine similarity against one page embedding.

    Args:
        content: Page text. Unused here; kept for interface compatibility
            with existing callers.
        content_embedding: Embedding vector of the page.
        keywords: Keyword strings to score.
        model_type: Forwarded to generate_embeddings.

    Returns:
        DataFrame with 'Keyword' and 'RelevanceScore' columns (scores as
        percentages; negative scores are intentionally kept), one row per
        keyword.
    """
    # Fix: an empty keyword list previously reached cosine_similarity with
    # an empty matrix and raised; return an empty frame with the expected
    # columns instead.
    if not keywords:
        return pd.DataFrame({'Keyword': [], 'RelevanceScore': []})
    keyword_embeddings = generate_embeddings(keywords, model_type)
    relevance_scores = cosine_similarity([content_embedding], keyword_embeddings)[0]
    # Scale to percentages; include negative scores.
    data = {'Keyword': keywords, 'RelevanceScore': relevance_scores * 100}
    return pd.DataFrame(data)
def _load_keywords(keywords, keywords_file):
    """Resolve the keyword list from an uploaded file or the textbox value.

    File input wins when present; keywords are assumed to be in the first
    column. Extension matching is case-insensitive (generalization: the
    original rejected e.g. 'KEYWORDS.CSV').
    """
    if keywords_file is not None:
        name = keywords_file.name.lower()
        if name.endswith('.csv'):
            keywords_df = pd.read_csv(keywords_file)
        elif name.endswith('.xlsx'):
            keywords_df = pd.read_excel(keywords_file)
        else:
            raise ValueError("Unsupported file format. Please upload a CSV or XLSX file.")
        return keywords_df.iloc[:, 0].tolist()  # Assuming keywords are in the first column
    return [keyword.strip() for keyword in keywords.split(',') if keyword.strip()]


def process_input(urls, keywords, keywords_file, model_type):
    """Score keyword relevance for each URL and write the results to CSV.

    Args:
        urls: Newline-separated URLs from the textbox.
        keywords: Comma-separated keywords (ignored when a file is uploaded).
        keywords_file: Optional uploaded .csv/.xlsx keyword file.
        model_type: 'English' or 'Multilingual' embedding model selector.

    Returns:
        (results DataFrame, relevance CSV path, scraped-content CSV path).

    Raises:
        ValueError: If the uploaded keywords file is neither CSV nor XLSX.
    """
    url_list = [url.strip() for url in urls.split('\n') if url.strip()]  # Clean up and split the URLs
    keywords = _load_keywords(keywords, keywords_file)

    valid_urls, url_embeddings, url_contents = get_embeddings_for_urls(url_list, model_type)

    # Calculate relevance for each URL's content.
    result_df_list = []
    for content, embedding in zip(url_contents, url_embeddings):
        result_df = calculate_relevance(content, embedding, keywords, model_type)
        result_df['Content'] = content[:100] + "..."  # Short preview of the content
        result_df_list.append(result_df)

    # Fix: pd.concat raises on an empty list (e.g. every fetch failed);
    # fall back to an empty frame with the expected columns.
    if result_df_list:
        final_result_df = pd.concat(result_df_list).reset_index(drop=True)
    else:
        final_result_df = pd.DataFrame({'Keyword': [], 'RelevanceScore': [], 'Content': []})

    # Save the result to a CSV file.
    final_result_df.to_csv("relevance_scores.csv", index=False)
    # Save the scraped content to a CSV file.
    scraped_content_df = pd.DataFrame({'URL': valid_urls, 'Content': url_contents})
    scraped_content_df.to_csv("scraped_content.csv", index=False)
    return final_result_df, "relevance_scores.csv", "scraped_content.csv"
# --- Gradio UI wiring ---
# Input widgets: URLs, manual keywords, optional keyword file, model choice.
_input_widgets = [
    gr.Textbox(label="Enter URLs (one per line)", lines=5, placeholder="https://example.com\nhttps://example.org"),
    gr.Textbox(label="Enter Keywords (comma-separated)", placeholder="keyword1, keyword2, keyword3"),
    gr.File(label="Upload Keywords File (keywords.csv or keywords.xlsx)"),
    gr.Radio(label="Select Model Type", choices=['English', 'Multilingual'], value='Multilingual'),
]
# Output widgets: on-screen table plus two downloadable CSVs.
_output_widgets = [
    gr.Dataframe(label="Relevance Scores"),
    gr.File(label="Download Relevance Scores CSV"),
    gr.File(label="Download Scraped Content CSV"),
]
interface = gr.Interface(
    fn=process_input,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="Keyword Relevance to URLs",
    description="Enter URLs (one per line), enter keywords manually, or upload a 'keywords.csv' or 'keywords.xlsx' file to check their relevance.",
)

if __name__ == "__main__":
    # Launch the Gradio interface. share=True publishes a temporary public
    # link; show_error=True surfaces tracebacks in the UI.
    interface.launch(share=True, show_error=True)