File size: 1,173 Bytes
6df4ebe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re, numpy as np
from typing import List
from embedder import embed_texts

def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    The input is stripped first; every resulting piece is stripped as well
    and empty pieces are discarded. The terminating punctuation stays
    attached to its sentence.

    Args:
        text: Raw text to split.

    Returns:
        The non-empty sentences in their original order (an empty list when
        *text* is empty or only whitespace).
    """
    sentences: List[str] = []
    # The look-behind keeps the punctuation on the left-hand piece; only the
    # whitespace run after it is consumed by the split.
    for piece in re.split(r'(?<=[\.\!\?])\s+', text.strip()):
        cleaned = piece.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

def mmr_summarize(text: str, max_sentences: int = 4, diversity: float = 0.6) -> str:
    """Build an extractive summary by greedy MMR-style sentence selection.

    The text is split into sentences with ``split_sentences`` and each
    sentence is embedded with ``embed_texts``. The first pick is the sentence
    with the largest dot product against the normalized mean embedding (the
    centroid). Each further pick is the unselected sentence maximizing::

        diversity * relevance - (1 - diversity) * redundancy

    where *relevance* is the dot product with the centroid and *redundancy*
    is the largest dot product with any already-selected sentence.

    Args:
        text: Source text to summarize.
        max_sentences: Upper bound on the number of sentences in the summary.
            Zero or a negative value yields an empty summary.
        diversity: Weight used in the score above.
            NOTE(review): as written, this value multiplies *relevance* and
            ``1 - diversity`` multiplies the redundancy penalty, so a larger
            value penalizes redundancy less. Confirm that this matches the
            intended meaning of the parameter name before changing it.

    Returns:
        The selected sentences joined by single spaces, in their original
        order. If the text has at most ``max_sentences`` sentences, all of
        them are returned (re-joined by single spaces). If no sentence is
        found, ``text.strip()`` is returned.
    """
    # Nothing can be selected; previously one sentence was still returned
    # because the seed sentence was chosen before the size check.
    if max_sentences <= 0:
        return ""

    sents = split_sentences(text)
    if not sents:
        return text.strip()
    if len(sents) <= max_sentences:
        return " ".join(sents)

    # Assumes embed_texts returns one equal-length vector per sentence, so
    # this is a 2-D (sentence, dim) array -- TODO confirm against embedder.
    # Dot products equal cosine similarity only if those vectors are
    # unit-length; they are not re-normalized here.
    embs = np.asarray(embed_texts(sents))
    centroid = embs.mean(axis=0)
    centroid = centroid / (np.linalg.norm(centroid) + 1e-8)  # eps avoids 0/0

    # Relevance never changes between rounds, so compute it once.
    relevance = np.asarray(embs @ centroid, dtype=float)

    first = int(np.argmax(relevance))
    selected = [first]
    available = np.ones(len(sents), dtype=bool)
    available[first] = False

    # Running maximum similarity of every sentence to the selected set.
    redundancy = np.asarray(embs @ embs[first], dtype=float)

    while len(selected) < max_sentences:
        scores = diversity * relevance - (1 - diversity) * redundancy
        # Exclude already-selected sentences and undefined (NaN) scores.
        scores[~available] = -np.inf
        scores[np.isnan(scores)] = -np.inf

        # argmax returns the first maximum, i.e. the earliest sentence on ties.
        idx = int(np.argmax(scores))
        if scores[idx] == -np.inf:
            break  # no usable candidate left

        selected.append(idx)
        available[idx] = False
        redundancy = np.maximum(
            redundancy, np.asarray(embs @ embs[idx], dtype=float)
        )

    # Present the summary in document order rather than selection order.
    selected.sort()
    return " ".join(sents[i] for i in selected)