| |
|
| | import torch
|
| | import torch.nn as nn
|
| | import torch.optim as optim
|
| | import pandas as pd
|
| | from sentence_transformers import SentenceTransformer, util
|
| | import numpy as np
|
| | from sklearn.preprocessing import MinMaxScaler
|
| | from collections import Counter
|
| | import re
|
| | import string
|
| | from collections import Counter
|
| | from sklearn.feature_extraction.text import TfidfVectorizer
|
| | from nltk.corpus import stopwords
|
| | from nltk.stem import WordNetLemmatizer
|
| | from nltk.tokenize import word_tokenize
|
| | import spacy
|
| |
|
def truncate_text(text, max_length=1024):
    """Clip *text* to at most *max_length* whitespace-delimited tokens.

    When the text is already within the limit it is returned unchanged
    (original whitespace preserved); otherwise the first *max_length*
    tokens are re-joined with single spaces.
    """
    words = text.split()
    if len(words) <= max_length:
        return text
    return ' '.join(words[:max_length])
|
| |
|
class RankingNN(nn.Module):
    """Feed-forward regressor mapping a feature vector to one relevance score.

    Architecture: input -> 64 -> 32 -> 16 -> 1, ReLU after each hidden
    layer, dropout (p=0.2) after the first two hidden activations.
    """

    def __init__(self, input_size=7):
        super(RankingNN, self).__init__()
        # Modules are created in the same order as before so that parameter
        # initialization under a fixed RNG seed stays reproducible.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the (batch, 1) score tensor for input batch *x*."""
        hidden = self.dropout(torch.relu(self.fc1(x)))
        hidden = self.dropout(torch.relu(self.fc2(hidden)))
        hidden = torch.relu(self.fc3(hidden))
        return self.fc4(hidden)
|
| |
|
# Shared singletons created once at import time and reused by the training
# function below.
transformer_model = SentenceTransformer('all-MiniLM-L6-v2')  # sentence embedder
ranking_model = RankingNN()  # the network trained incrementally per query
optimizer = optim.Adam(ranking_model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()
# NOTE(review): this scaler is re-fit on every train_ranking_model() call, so
# feature scaling is only consistent within a single batch of results.
scaler = MinMaxScaler()
|
| |
|
| |
|
# NLTK corpora are fetched at import time; nltk.download() is effectively a
# no-op when the resource is already cached locally.
import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # dictionary backing WordNetLemmatizer


# Text-processing singletons shared by the keyword helpers below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")  # English spaCy pipeline used for NER
|
| |
|
def preprocess_text(text):
    """Normalize *text* into a list of lowercase, lemmatized content words.

    Steps: lowercase, replace punctuation with spaces, tokenize with NLTK,
    keep only purely alphabetic tokens that are not English stopwords, and
    lemmatize each surviving token.

    Returns:
        list[str]: the processed tokens (possibly empty).
    """
    text = text.lower()

    # Escape the punctuation set before building the character class:
    # string.punctuation contains ']', '\\', '^' and '-', which are special
    # inside [...] and previously only avoided breaking the pattern by
    # coincidence ('\\]' happened to pair up, ',-.' happened to be a valid
    # range). re.escape makes the class robust and matches the same chars.
    text = re.sub('[' + re.escape(string.punctuation) + ']', ' ', text)

    words = word_tokenize(text)

    return [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalpha() and word not in stop_words
    ]
|
| |
|
def extract_named_entities(text):
    """Return the surface text of person/org/place entities found in *text*.

    Only entities labeled PERSON, ORG, GPE or LOC by the spaCy pipeline
    are kept.
    """
    wanted_labels = {"PERSON", "ORG", "GPE", "LOC"}
    document = nlp(text)
    return [entity.text for entity in document.ents if entity.label_ in wanted_labels]
|
| |
|
def extract_keywords_tfidf(corpus, text, n=5):
    """Pick the top-*n* keywords for *text* via TF-IDF over *corpus*.

    The vectorizer vocabulary is fit on the preprocessed corpus; candidates
    are the n highest-scoring terms for *text*, extended with named entities
    taken from the raw (unpreprocessed) text, then truncated back to n.
    """
    # Normalize the corpus and the query text the same way before vectorizing.
    cleaned_corpus = [' '.join(preprocess_text(document)) for document in corpus]
    cleaned_text = ' '.join(preprocess_text(text))

    # Entities come from the raw text, not the normalized form.
    entities = extract_named_entities(text)

    vectorizer = TfidfVectorizer(max_features=1000)
    vectorizer.fit_transform(cleaned_corpus)  # fitting defines the vocabulary

    terms = vectorizer.get_feature_names_out()
    scores = vectorizer.transform([cleaned_text]).toarray()[0]

    # Rank vocabulary terms by their TF-IDF weight in the query text.
    ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
    top_terms = [term for term, _score in ranked[:n]]

    return (top_terms + entities)[:n]
|
| |
|
def extract_keywords(text, corpus, n=5):
    """Top-*n* keywords for *text*: TF-IDF + NER, with a frequency fallback.

    Returns [] for blank/whitespace-only input.
    """
    if not text.strip():
        return []

    keywords = extract_keywords_tfidf(corpus, text, n)

    # Fall back to raw word frequency when TF-IDF/NER produced nothing.
    return keywords if keywords else extract_fallback_keywords(text, n)
|
| |
|
def extract_fallback_keywords(text, n=5):
    """Frequency-based fallback: the *n* most common preprocessed words."""
    counts = Counter(preprocess_text(text))
    return [word for word, _count in counts.most_common(n)]
|
| |
|
def calculate_keyword_overlap(query_keywords, result_keywords):
    """Ratio of shared distinct keywords to the length of *query_keywords*.

    The numerator is the size of the set intersection; the denominator is the
    raw (possibly duplicated) length of the query list. Returns 0 for an
    empty query list to avoid division by zero.
    """
    if not query_keywords:
        return 0
    shared = set(query_keywords) & set(result_keywords)
    return len(shared) / len(query_keywords)
|
| |
|
def train_ranking_model(query, results, corpus=None, epochs=1):
    """Train the shared ranking network on one query batch and re-rank *results*.

    Each result dict may carry 'content', 'title', 'link' and a 'meta' dict
    with 'description', 'content_length' and 'total_links'. Seven features
    are built per result (three embedding cosine similarities, two meta
    counts, keyword overlap, domain-authority prior); the supervision target
    is a fixed weighted blend of the similarity/overlap features.

    Args:
        query: free-text query (truncated to 1024 tokens).
        results: list of result dicts. Mutated in place: each dict gains a
            'predicted_score' key.
        corpus: optional document list for TF-IDF; defaults to the results'
            own (present) contents.
        epochs: number of gradient steps over this batch.

    Returns:
        The results sorted by 'predicted_score' descending, or [] when
        *results* is empty.
    """
    query = truncate_text(query)
    if not results:
        print("No results available. Skipping training.")
        return []

    if corpus is None:
        # Default corpus: the result bodies themselves (results without a
        # 'content' key are skipped).
        corpus = [truncate_text(result['content']) for result in results if 'content' in result]

    query_embedding = transformer_model.encode(query)
    query_keywords = extract_keywords(query, corpus)

    training_data = []
    target_scores = []

    for result in results:
        # Bugfix: use .get() so a result without 'content' no longer raises
        # KeyError — the corpus construction above already anticipates that
        # case with its 'content' in result guard.
        content = truncate_text(result.get('content', ''))
        content_embedding = transformer_model.encode(content)

        title = truncate_text(result.get('title', ''))
        title_embedding = transformer_model.encode(title)

        meta_description = truncate_text(result.get('meta', {}).get('description', ''))
        meta_description_embedding = transformer_model.encode(meta_description)

        content_similarity = util.pytorch_cos_sim(query_embedding, content_embedding).item()
        title_similarity = util.pytorch_cos_sim(query_embedding, title_embedding).item()
        meta_description_similarity = util.pytorch_cos_sim(query_embedding, meta_description_embedding).item()

        content_length = result.get('meta', {}).get('content_length', 0)
        total_links = result.get('meta', {}).get('total_links', 0)

        result_keywords = extract_keywords(content, corpus)
        keyword_overlap = calculate_keyword_overlap(query_keywords, result_keywords)
        domain_authority = get_domain_authority(result.get('link', ''))

        features = [
            content_similarity, title_similarity, meta_description_similarity,
            content_length, total_links, keyword_overlap, domain_authority
        ]
        training_data.append(features)

        # Heuristic supervision signal: fixed blend of the soft features.
        target_score = (0.4 * content_similarity + 0.3 * title_similarity +
                        0.2 * meta_description_similarity + 0.1 * keyword_overlap)
        target_scores.append(target_score)

    # NOTE(review): the module-level MinMaxScaler is re-fit on every call, so
    # feature scaling is only consistent within this batch.
    training_data = scaler.fit_transform(training_data)
    training_data_tensor = torch.tensor(training_data, dtype=torch.float32)
    target_scores_tensor = torch.tensor(target_scores, dtype=torch.float32).unsqueeze(1)

    for epoch in range(epochs):
        optimizer.zero_grad()
        predicted_scores = ranking_model(training_data_tensor)
        loss = criterion(predicted_scores, target_scores_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    # Score with the freshly updated weights; no gradients needed here.
    with torch.no_grad():
        final_scores = ranking_model(training_data_tensor).squeeze().tolist()

    # .squeeze() collapses a single-row batch to a bare float; re-wrap it.
    if isinstance(final_scores, float):
        final_scores = [final_scores]

    for result, score in zip(results, final_scores):
        result['predicted_score'] = score

    ranked_results = sorted(results, key=lambda x: x['predicted_score'], reverse=True)
    return ranked_results
|
| |
|
def get_domain_authority(url):
    """Heuristic authority prior for *url* based on substring domain matching.

    Returns 1.0 for well-known scientific domains, 0.7 for medium-authority
    community sites, and 0.5 for everything else. Tiers are checked in order,
    so a URL matching both gets the higher score.
    """
    authority_tiers = [
        (1.0, ['arxiv.org', 'ncbi.nlm.nih.gov', 'nature.com', 'science.org']),
        (0.7, ['wikipedia.org', 'stackexchange.com', 'github.com']),
    ]
    for score, domains in authority_tiers:
        if any(domain in url for domain in domains):
            return score
    return 0.5