File size: 3,981 Bytes
5c33ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75eb0e0
5c33ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import gradio as gr
import requests
from bs4 import BeautifulSoup
import cohere
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
import asyncio
from apify_client import ApifyClient
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Load environment variables from a local .env file so API keys are not
# hard-coded (expects COHERE_API_KEY and APIFY_API_KEY; os.getenv returns
# None for missing keys, so failures surface at client construction/use).
load_dotenv()

# Initialize Cohere client.
# NOTE(review): `co` is not referenced anywhere in this file — presumably
# reserved for a future re-ranking step; confirm before deleting.
co = cohere.Client(os.getenv('COHERE_API_KEY'))

# Initialize Apify client (used by search_apify for web-search scraping).
apify_client = ApifyClient(os.getenv('APIFY_API_KEY'))

# Load the LeetCode dataset from the Hugging Face Hub and materialize it
# as a DataFrame (downloads on first run; needs network access at startup).
dataset = load_dataset("RayBernard/leetcode", split="train")
df = pd.DataFrame(dataset)

# Initialize the sentence-transformer model used for all embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute one embedding per problem title, stored as a DataFrame column,
# so each query only has to embed the query text at request time.
df['embeddings'] = df['title'].apply(lambda x: model.encode(x))

def search_apify(query):
    """Run the configured Apify search actor for *query* and return URLs.

    Parameters
    ----------
    query : str
        Free-text search query; internal whitespace runs are collapsed.

    Returns
    -------
    list[str]
        Up to 10 organic-result URLs. Empty list when the actor returns
        no items or no organic results (the original indexed
        ``items[0]['organicResults']`` unconditionally and raised
        IndexError/KeyError in that case).
    """
    # Collapse whitespace so the actor receives a clean one-line query.
    query = " ".join(query.split())
    run_input = {
        "queries": query,
        "resultsPerPage": 10,
        "maxPagesPerQuery": 1,
        "languageCode": "",
        "mobileResults": False,
        "includeUnfilteredResults": False,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeIcons": False
    }
    # NOTE(review): actor id is hard-coded; presumably a Google-search
    # scraper actor — confirm against the Apify console.
    run = apify_client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
    items = apify_client.dataset(run["defaultDatasetId"]).list_items().items
    if not items:
        return []
    organic = items[0].get('organicResults', [])
    urls = [item['url'] for item in organic if 'url' in item]
    return urls[:10]

def scrape_content(url):
    """Fetch *url* and return its visible text, or "" on any fetch failure.

    Parameters
    ----------
    url : str
        Page to download.

    Returns
    -------
    str
        Plain text extracted by BeautifulSoup, or the empty string when
        the request fails or the server answers with an HTTP error.

    A timeout keeps one slow host from stalling the whole scoring loop,
    and only request-level errors are swallowed — the original bare
    ``except:`` also hid KeyboardInterrupt and programming errors.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # treat HTTP 4xx/5xx as "no content"
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.RequestException:
        return ""

def get_embedding(text):
    """Encode *text* into a dense vector with the shared sentence model."""
    vector = model.encode(text)
    return vector

def calculate_relevancy(query_embedding, result_embedding):
    """Return the cosine similarity between two embedding vectors.

    Parameters
    ----------
    query_embedding, result_embedding : array-like of float
        1-D embedding vectors of equal length.

    Returns
    -------
    float
        Cosine similarity in [-1, 1]. Returns 0.0 when either vector has
        zero norm, matching scikit-learn's behaviour for all-zero inputs
        (the original routed a single vector pair through sklearn's
        matrix-oriented ``cosine_similarity`` plus two reshapes; a direct
        numpy dot product is equivalent and lighter).
    """
    q = np.asarray(query_embedding, dtype=float).ravel()
    r = np.asarray(result_embedding, dtype=float).ravel()
    denom = np.linalg.norm(q) * np.linalg.norm(r)
    if denom == 0.0:
        # Avoid NaN from 0/0; sklearn yields 0 for zero vectors too.
        return 0.0
    return float(np.dot(q, r) / denom)

def search_leetcode(query, top_k=5):
    """Return the *top_k* LeetCode problems most similar to *query*.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int, optional
        Number of rows to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``difficulty``, ``similarity``, sorted by
        descending similarity.
    """
    query_embedding = get_embedding(query)
    # Compute similarities into a local Series instead of mutating the
    # module-level ``df`` — the original added a 'similarity' column to
    # the shared frame on every call, a hidden side effect and a race if
    # two Gradio requests overlap.
    similarity = df['embeddings'].apply(
        lambda emb: calculate_relevancy(query_embedding, emb)
    )
    results = df.assign(similarity=similarity).nlargest(top_k, 'similarity')
    return results[['title', 'difficulty', 'similarity']]

async def search_and_score(query):
    """Score web search hits and LeetCode problems against *query*.

    Returns a single table (list of rows) with two sections: Apify web
    results ranked by cosine relevancy of their scraped text, followed by
    the closest LeetCode dataset titles.
    """
    query_embedding = get_embedding(query)

    # Web section: scrape each search hit and score it against the query.
    web_scores = []
    for link in search_apify(query):
        page_text = scrape_content(link)
        score = calculate_relevancy(query_embedding, get_embedding(page_text))
        web_scores.append((link, score))
    web_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Dataset section.
    leetcode_hits = search_leetcode(query)

    # Assemble one table: a header row per section, then its data rows.
    table = [["Apify Results", "Relevancy Score"]]
    for link, score in web_scores:
        table.append(
            [f'<a href="{link}" target="_blank">{link}</a>', f'{score:.4f}']
        )
    table.append(["LeetCode Results", "Similarity Score"])
    for _, hit in leetcode_hits.iterrows():
        table.append(["LeetCode: " + hit['title'], f"{hit['similarity']:.4f}"])

    return table

# Synchronous entry point for Gradio: drive the async search to completion.
def search_and_score_wrapper(query):
    """Run search_and_score on a fresh event loop and return its table."""
    result = asyncio.run(search_and_score(query))
    return result

# Create Gradio interface: one free-text query in, a two-column table out.
# datatype "html" lets the anchor tags emitted by search_and_score render
# as clickable links in the output Dataframe.
iface = gr.Interface(
    fn=search_and_score_wrapper,
    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
    outputs=gr.Dataframe(headers=["Result", "Score"], datatype=["html", "number"]),
    title="Search Relevancy Scorer with RAG",
    description="Enter a search query to get relevant results from web search and LeetCode dataset."
)

# Launch the interface (serves locally and blocks by default).
iface.launch()