MarCognity-AI / src /evaluation /ranking_originality.py

Create ranking_originality.py

9397062 verified 5 months ago

3.77 kB

	# © 2025 Elena Marziali — Code released under Apache 2.0 license.
	# See LICENSE in the repository for details.
	# Removal of this copyright is prohibited.

	# Sample data for ranking
	data = np.array([
	[120, 45, 1, 2023], # Citations, h-index, peer review, year
	[50, 30, 1, 2020],
	[10, 15, 0, 2018]
	])

	labels = [95, 70, 30] # Academic impact score

	# Model training
	ranking_model = RandomForestRegressor(n_estimators=100)
	ranking_model.fit(data, labels)

	# Ranking prediction
	def calculate_impact_score(citations, h_index, peer_review, publication_year):
	paper_data = np.array([[citations, h_index, peer_review, publication_year]])
	score = ranking_model.predict(paper_data)
	return max(0, score[0]) # Ensure non-negative

	# Usage example
	impact_score = calculate_impact_score(80, 40, 1, 2024)
	print(f"Estimated score: {impact_score}")

	# Ranking model
	from sklearn.ensemble import RandomForestRegressor

	# Sample data for ranking
	data = np.array([
	[120, 45, 1, 2023], # Citations, h-index, peer review, year
	[50, 30, 1, 2020],
	[10, 15, 0, 2018]
	])

	labels = [95, 70, 30] # Academic impact score

	# Model training
	ranking_model = RandomForestRegressor(n_estimators=100)
	ranking_model.fit(data, labels)

	# Ranking prediction
	new_paper = np.array([[80, 40, 1, 2024]])
	score = ranking_model.predict(new_paper)
	print(f"Estimated score: {score[0]}")

	# === Scientific originality evaluation ===
	def evaluate_hypothesis_novelty(hypothesis, existing_articles, threshold=0.7):
	"""
	Compares the hypothesis with existing articles using semantic embeddings.
	Returns:
	- average similarity score
	- similar articles
	- qualitative assessment of originality
	"""
	try:
	emb_hypothesis = model_embedding.encode([hypothesis])
	emb_articles = model_embedding.encode([a["abstract"] for a in existing_articles if "abstract" in a])

	similarity = np.dot(emb_hypothesis, emb_articles.T) / (
	np.linalg.norm(emb_hypothesis) * np.linalg.norm(emb_articles, axis=1)
	)
	average = round(float(np.mean(similarity)), 3)

	similar_articles = [
	existing_articles[i]["title"]
	for i, score in enumerate(similarity[0]) if score > threshold
	]

	if average < 0.4:
	assessment = "High originality: hypothesis is rarely present in the literature."
	elif average < 0.7:
	assessment = "Moderate originality: related concepts exist."
	else:
	assessment = "Low originality: hypothesis is already widely discussed."

	return {
	"novelty_score": average,
	"similar_articles": similar_articles,
	"assessment": assessment
	}

	except Exception as e:
	logging.error(f"[evaluate_novelty] Error during originality evaluation: {e}")
	return {
	"novelty_score": 0.0,
	"similar_articles": [],
	"assessment": "Error during originality evaluation."
	}

	# Automated paper review with AI
	async def review_paper(paper_text):
	""" Checks the methodology and citation quality of a paper. """
	methodology = await verify_methodology(paper_text)
	citations = await verify_citations(paper_text)
	return {"methodology": methodology, "citations": citations}

	async def validate_hypothesis(hypothesis):
	sources = await search_multi_database(hypothesis)
	score = calculate_impact_score(sources) # Based on citations, year, h-index, etc.
	summary = summarize_evidence(sources)
	return score, summary

	def summarize_evidence(sources):
	return "\n".join([
	f"- {a['title'][:80]}…" for a in sources if isinstance(a, dict) and 'title' in a
	]) if sources else "No evidence found."