File size: 1,261 Bytes
542a9ad
 
 
fd52f78
 
1f81155
fd52f78
 
 
 
 
 
 
 
 
 
 
c4e42f8
fd52f78
c4e42f8
 
 
fd52f78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import sys
sys.modules["bertopic.plotting"] = None  # Blocks plotting from loading

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from bertopic._bertopic import BERTopic  # internal module that avoids plotting
from sentence_transformers import SentenceTransformer
import uvicorn

# FastAPI application instance; the route handlers in this module register on it.
app = FastAPI()

# CORS is left fully permissive: any origin, any HTTP method and any request
# header is accepted by the middleware.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# Multilingual sentence-embedding model, chosen by the original author for its
# Czech support. Loaded once at import time and reused by the topic model below.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Module-level topic model shared by every request to /segment-topics.
# The original author passed umap_model=None intending to disable UMAP (to avoid
# its caching and the plotting imports).
# NOTE(review): BERTopic documents None as the *default* for umap_model, in which
# case it falls back to its built-in UMAP reducer - so this argument most likely
# does NOT disable UMAP. Confirm against the installed BERTopic version; skipping
# dimensionality reduction requires passing an explicit no-op reducer instead.
topic_model = BERTopic(embedding_model=embedding_model, umap_model=None)

@app.post("/segment-topics")
async def segment_topics(request: Request):
    """Assign a BERTopic topic id to every text segment in the request body.

    The body must be a JSON object holding three parallel lists:

    * ``texts``  - the segment strings that are clustered into topics.
    * ``starts`` - one value per segment, echoed back unchanged.
    * ``ends``   - one value per segment, echoed back unchanged.

    ``starts``/``ends`` are not interpreted here (presumably segment
    timestamps - verify against the caller).

    Returns:
        A list with one ``{"text", "topic", "start", "end"}`` dict per
        segment, in input order. An empty ``texts`` list yields ``[]``
        without touching the model.

    Raises:
        HTTPException: status 400 when the body is not valid JSON, is not a
            JSON object, one of the three fields is not a list, or the lists
            differ in length.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    try:
        payload = await request.json()
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError are ValueErrors
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from exc

    if not isinstance(payload, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    texts = payload.get("texts", [])
    starts = payload.get("starts", [])
    ends = payload.get("ends", [])

    for field, value in (("texts", texts), ("starts", starts), ("ends", ends)):
        if not isinstance(value, list):
            raise HTTPException(
                status_code=400, detail=f"'{field}' must be a list."
            )

    # zip() would silently drop trailing segments on a length mismatch, so
    # reject inconsistent input instead of returning a truncated result.
    if not (len(texts) == len(starts) == len(ends)):
        raise HTTPException(
            status_code=400,
            detail="'texts', 'starts' and 'ends' must have the same length.",
        )

    # Nothing to cluster: skip the model fit entirely.
    if not texts:
        return []

    # NOTE(review): fit_transform refits the shared module-level model on every
    # request and runs synchronously inside this coroutine, so the event loop
    # is blocked while it executes. Left as-is because moving it to a worker
    # thread would let concurrent requests mutate the same model.
    topics, _ = topic_model.fit_transform(texts)

    return [
        {"text": text, "topic": int(topic), "start": start, "end": end}
        for text, topic, start, end in zip(texts, topics, starts, ends)
    ]