import re
from typing import Any, Dict, List

import joblib
import numpy as np

from preprocessor import preprocess_for_summarization


class ArabicSummarizer:
    """Extractive Arabic text summarizer using TF-IDF scoring.

    Text is cleaned with ``preprocess_for_summarization``, split into
    sentences, and each sentence is scored by the sum of its TF-IDF weights
    as produced by the vectorizer loaded in ``__init__``. Only ``transform``
    is ever called on the vectorizer, so it is expected to be fitted already.
    """

    # Sentence boundaries: runs of '.', '!', the Arabic question mark, or
    # newlines. Compiled once because both public methods split on it.
    _SENTENCE_BOUNDARY = re.compile(r"[.!؟\n]+")

    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
        """Load the TF-IDF vectorizer from ``vectorizer_path``.

        Args:
            vectorizer_path: Path to a joblib-serialized vectorizer.
        """
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; only point this at trusted model artifacts.
        self.vectorizer = joblib.load(vectorizer_path)

    @classmethod
    def _split_sentences(cls, cleaned_text: str) -> List[str]:
        """Split text on sentence boundaries, dropping blank fragments."""
        fragments = (part.strip() for part in cls._SENTENCE_BOUNDARY.split(cleaned_text))
        return [fragment for fragment in fragments if fragment]

    def _score_sentences(self, sentences: List[str]) -> np.ndarray:
        """Return a 1-D array holding the summed TF-IDF weight of each sentence."""
        tfidf_matrix = self.vectorizer.transform(sentences)
        # np.asarray(...).ravel() flattens both the np.matrix returned by
        # sparse matrices and the ndarray returned by sparse arrays, unlike
        # the matrix-only ``.A1`` attribute.
        return np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    @staticmethod
    def _top_indices(scores, count: int) -> List[int]:
        """Return the indices of the ``count`` highest scores, in ascending index order."""
        # Reverse first, then take a prefix: a ``[-count:]`` suffix slice
        # would select *everything* when count == 0.
        best_first = np.argsort(scores)[::-1][:count]
        return sorted(int(i) for i in best_first)

    @staticmethod
    def _rank_descending(scores) -> List[int]:
        """Return the 1-based rank of every score (rank 1 = highest score)."""
        best_first = np.argsort(scores)[::-1]
        ranks = np.empty(len(best_first), dtype=int)
        # Scatter positions back to their original indices in one pass
        # instead of searching the ordering once per sentence.
        ranks[best_first] = np.arange(1, len(best_first) + 1)
        return ranks.tolist()

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize text by selecting top-scored sentences.

        Args:
            text: Raw input text.
            num_sentences: Maximum number of sentences to keep (>= 0).

        Returns:
            A dict with keys ``summary``, ``original_sentence_count``,
            ``summary_sentence_count``, ``sentences``, ``selected_indices``,
            ``sentence_scores`` and ``top_sentence_scores``. When the text has
            no more than ``num_sentences`` sentences, nothing is scored: the
            summary is the original ``text`` unchanged and both score fields
            are ``None``. Otherwise the selected sentences are joined with
            spaces in their original document order.

        Raises:
            ValueError: If ``num_sentences`` is negative.
        """
        if num_sentences < 0:
            raise ValueError(
                f"num_sentences must be non-negative, got {num_sentences}"
            )

        sentences = self._split_sentences(preprocess_for_summarization(text))
        total = len(sentences)

        if total <= num_sentences:
            return {
                "summary": text,
                "original_sentence_count": total,
                "summary_sentence_count": total,
                "sentences": sentences,
                "selected_indices": list(range(total)),
                "sentence_scores": None,
                # Present (as None) so both return paths expose the same keys.
                "top_sentence_scores": None,
            }

        sentence_scores = self._score_sentences(sentences)
        selected = self._top_indices(sentence_scores, num_sentences)
        top_sentences = [sentences[i] for i in selected]

        return {
            "summary": " ".join(top_sentences),
            "original_sentence_count": total,
            "summary_sentence_count": len(top_sentences),
            "sentences": sentences,
            "selected_indices": selected,
            "sentence_scores": sentence_scores.tolist(),
            "top_sentence_scores": [float(sentence_scores[i]) for i in selected],
        }

    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
        """Get detailed analysis of all sentences with scores and rankings.

        Args:
            text: Raw input text.

        Returns:
            ``{"error": "No sentences found in text"}`` when no sentence
            survives cleaning and splitting. Otherwise a dict with
            ``sentences`` (one entry per sentence holding ``index``,
            ``sentence``, ``score`` and 1-based ``rank``, where rank 1 is the
            highest score), ``total_sentences``, and ``score_statistics``
            (``mean``, ``std``, ``min``, ``max`` of the scores).
        """
        sentences = self._split_sentences(preprocess_for_summarization(text))
        if not sentences:
            return {"error": "No sentences found in text"}

        sentence_scores = self._score_sentences(sentences)
        ranks = self._rank_descending(sentence_scores)

        sentence_analysis = [
            {
                "index": index,
                "sentence": sentence,
                "score": float(score),
                "rank": rank,
            }
            for index, (sentence, score, rank) in enumerate(
                zip(sentences, sentence_scores, ranks)
            )
        ]

        return {
            "sentences": sentence_analysis,
            "total_sentences": len(sentences),
            "score_statistics": {
                "mean": float(np.mean(sentence_scores)),
                "std": float(np.std(sentence_scores)),
                "min": float(np.min(sentence_scores)),
                "max": float(np.max(sentence_scores)),
            },
        }