Spaces:

mabosaimi
/

arabic-summarizer-classifier

Sleeping

App Files Files Community

arabic-summarizer-classifier / summarizer.py

mabosaimi

Fkhrayef (#1)

5fc9256 verified 8 months ago

raw

history blame contribute delete

3.16 kB

	import re
	import numpy as np
	import joblib
	from typing import Dict, Any
	from preprocessor import preprocess_for_summarization


	class ArabicSummarizer:
	    """Extractive Arabic text summarizer using TF-IDF scoring.

	    Input text is passed through ``preprocess_for_summarization``, split into
	    sentences on runs of ``.``, ``!``, the Arabic question mark and newlines,
	    and every sentence is scored by summing the weights the loaded vectorizer
	    assigns to it.
	    """

	    # Sentence boundary: one or more of '.', '!', Arabic question mark, newline.
	    _SENTENCE_BOUNDARY = re.compile(r"[.!؟\n]+")

	    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
	        """Load the vectorizer stored at *vectorizer_path*.

	        Args:
	            vectorizer_path: Path of a joblib file holding an object that
	                exposes ``transform(list_of_sentences)``.
	        """
	        # NOTE(review): joblib.load unpickles the file and can execute arbitrary
	        # code, so only point this at a trusted model artifact.
	        self.vectorizer = joblib.load(vectorizer_path)

	    @classmethod
	    def _split_sentences(cls, cleaned_text: str) -> list:
	        """Split *cleaned_text* into stripped, non-empty sentences."""
	        pieces = (piece.strip() for piece in cls._SENTENCE_BOUNDARY.split(cleaned_text))
	        return [piece for piece in pieces if piece]

	    def _score_sentences(self, sentences: list) -> np.ndarray:
	        """Return a 1-D array with one score (sum of weights) per sentence."""
	        tfidf_matrix = self.vectorizer.transform(sentences)
	        # sum(axis=1) is an (n, 1) np.matrix for SciPy sparse matrices but a plain
	        # ndarray for sparse arrays / dense arrays; asarray + ravel flattens both,
	        # whereas the matrix-only ``.A1`` attribute fails on the latter.
	        return np.asarray(tfidf_matrix.sum(axis=1)).ravel()

	    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
	        """Summarize text by selecting top-scored sentences.

	        Args:
	            text: Raw input text.
	            num_sentences: Maximum number of sentences to keep. Values of zero
	                or less select no sentences.

	        Returns:
	            A dict with the keys ``summary``, ``original_sentence_count``,
	            ``summary_sentence_count``, ``sentences``, ``selected_indices``,
	            ``sentence_scores`` and ``top_sentence_scores``. Selected sentences
	            are joined in their original order. When the text has no more than
	            ``num_sentences`` sentences, nothing is scored: ``summary`` is the
	            unmodified input ``text`` and both score entries are ``None``.
	        """
	        cleaned_text = preprocess_for_summarization(text)
	        sentences = self._split_sentences(cleaned_text)
	        total = len(sentences)

	        # Short input: every sentence would be selected, so skip scoring.
	        if total <= num_sentences:
	            return {
	                "summary": text,
	                "original_sentence_count": total,
	                "summary_sentence_count": total,
	                "sentences": sentences,
	                "selected_indices": list(range(total)),
	                "sentence_scores": None,
	                "top_sentence_scores": None,
	            }

	        sentence_scores = self._score_sentences(sentences)

	        # Guard against non-positive requests: the slice [-0:] would select
	        # *every* sentence, and a negative count would turn into [k:] and
	        # merely drop the k lowest-scored sentences.
	        if num_sentences > 0:
	            highest = np.argsort(sentence_scores)[-num_sentences:]
	            # Restore document order so the summary reads naturally.
	            selected = sorted(int(i) for i in highest)
	        else:
	            selected = []

	        top_sentences = [sentences[i] for i in selected]

	        return {
	            "summary": " ".join(top_sentences),
	            "original_sentence_count": total,
	            "summary_sentence_count": len(top_sentences),
	            "sentences": sentences,
	            "selected_indices": selected,
	            "sentence_scores": sentence_scores.tolist(),
	            "top_sentence_scores": [float(sentence_scores[i]) for i in selected],
	        }

	    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
	        """Get detailed analysis of all sentences with scores and rankings.

	        Args:
	            text: Raw input text.

	        Returns:
	            ``{"error": ...}`` when no sentence is found. Otherwise a dict with
	            ``sentences`` (one entry per sentence holding ``index``,
	            ``sentence``, ``score`` and a 1-based ``rank`` where 1 is the
	            highest score), ``total_sentences`` and ``score_statistics``
	            (``mean``, ``std``, ``min``, ``max`` of the scores).
	        """
	        cleaned_text = preprocess_for_summarization(text)
	        sentences = self._split_sentences(cleaned_text)

	        if not sentences:
	            return {"error": "No sentences found in text"}

	        sentence_scores = self._score_sentences(sentences)

	        # Rank every sentence with a single argsort instead of re-sorting and
	        # list-searching once per sentence: the sentence at position p of the
	        # descending order receives rank p + 1.
	        descending = np.argsort(sentence_scores)[::-1]
	        ranks = np.empty(len(descending), dtype=int)
	        ranks[descending] = np.arange(1, len(descending) + 1)

	        sentence_analysis = [
	            {
	                "index": index,
	                "sentence": sentence,
	                "score": float(score),
	                "rank": int(ranks[index]),
	            }
	            for index, (sentence, score) in enumerate(zip(sentences, sentence_scores))
	        ]

	        return {
	            "sentences": sentence_analysis,
	            "total_sentences": len(sentences),
	            "score_statistics": {
	                "mean": float(np.mean(sentence_scores)),
	                "std": float(np.std(sentence_scores)),
	                "min": float(np.min(sentence_scores)),
	                "max": float(np.max(sentence_scores)),
	            },
	        }