Update README.md
Browse files
README.md
CHANGED
|
@@ -409,4 +409,132 @@ topic_model.get_topic_info()
|
|
| 409 |
|367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
|
| 410 |
|368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
|
| 411 |
|
| 412 |
-
</details>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
|
| 410 |
|368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
|
| 411 |
|
| 412 |
+
</details>
|
| 413 |
+
|
| 414 |
+
## Training Procedure
|
| 415 |
+
|
| 416 |
+
The model was trained as follows:
|
| 417 |
+
|
| 418 |
+
```py
|
| 419 |
+
from bertopic import BERTopic
|
| 420 |
+
|
| 421 |
+
from sentence_transformers import SentenceTransformer
|
| 422 |
+
|
| 423 |
+
from umap import UMAP
|
| 424 |
+
from hdbscan import HDBSCAN
|
| 425 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 426 |
+
from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance, ZeroShotClassification
|
| 427 |
+
|
| 428 |
+
embedding_model = SentenceTransformer("all-mpnet-base-v2")
|
| 429 |
+
umap_model = UMAP(n_neighbors=25, n_components=5, min_dist=0.0, metric='cosine', random_state=42, verbose=True) #change n_neighbors, n_components, metric
|
| 430 |
+
hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True) #change min_cluster_size, min_samples
|
| 431 |
+
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
representation_models = {
|
| 435 |
+
"POS": PartOfSpeech("en_core_web_lg"),
|
| 436 |
+
"KeyBERTInspired": KeyBERTInspired(),
|
| 437 |
+
"MMR": MaximalMarginalRelevance(diversity=0.3),
|
| 438 |
+
"KeyBERT + MMR": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)],
|
| 439 |
+
"Summarization": summarization, # Custom prompted model that summarizes each topic (defined in the sections below).
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
topic_model = BERTopic(
|
| 443 |
+
language="english",
|
| 444 |
+
embedding_model=embedding_model,
|
| 445 |
+
umap_model=umap_model,
|
| 446 |
+
#hdbscan_model=hdbscan_model,
|
| 447 |
+
#vectorizer_model=vectorizer_model,
|
| 448 |
+
representation_model=representation_models,
|
| 449 |
+
verbose=True,
|
| 450 |
+
)
|
| 451 |
+
topics, probs = topic_model.fit_transform(docs)
|
| 452 |
+
```
|
| 453 |
+
|
| 454 |
+
## Create Own Representation Model
|
| 455 |
+
|
| 456 |
+
This uses [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) because it is lightweight.
|
| 457 |
+
|
| 458 |
+
### Defined Summarization
|
| 459 |
+
|
| 460 |
+
```py
|
| 461 |
+
import torch
|
| 462 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 463 |
+
|
| 464 |
+
torch.random.manual_seed(42)
|
| 465 |
+
|
| 466 |
+
summarization_model = AutoModelForCausalLM.from_pretrained(
|
| 467 |
+
"microsoft/Phi-3-mini-128k-instruct",
|
| 468 |
+
device_map="cuda",
|
| 469 |
+
torch_dtype="auto",
|
| 470 |
+
trust_remote_code=True,
|
| 471 |
+
)
|
| 472 |
+
summarization_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
|
| 473 |
+
|
| 474 |
+
def summarize_with_model(text):
|
| 475 |
+
question = f"""
|
| 476 |
+
I have a document of which abstract and title are given.
|
| 477 |
+
The following documents are a small but representative subset of all documents in the topic:
|
| 478 |
+
{text}
|
| 479 |
+
|
| 480 |
+
Based on the information above, please give a description topic in the following keyword format:
|
| 481 |
+
topic: <description>
|
| 482 |
+
"""
|
| 483 |
+
messages = [
|
| 484 |
+
{"role": "user", "content": question},
|
| 485 |
+
]
|
| 486 |
+
pipe = pipeline(
|
| 487 |
+
"text-generation",
|
| 488 |
+
model=summarization_model,
|
| 489 |
+
tokenizer=summarization_tokenizer,
|
| 490 |
+
)
|
| 491 |
+
generation_args = {
|
| 492 |
+
"max_new_tokens": 128,
|
| 493 |
+
"return_full_text": False,
|
| 494 |
+
"temperature": 0.0,
|
| 495 |
+
"do_sample": False,
|
| 496 |
+
}
|
| 497 |
+
output = pipe(messages, **generation_args)
|
| 498 |
+
return output[0]['generated_text']
|
| 499 |
+
|
| 500 |
+
```
|
| 501 |
+
|
| 502 |
+
Prompt used:
|
| 503 |
+
|
| 504 |
+
```py
|
| 505 |
+
question = f"""
|
| 506 |
+
I have a document of which abstract and title are given.
|
| 507 |
+
The following documents are a small but representative subset of all documents in the topic:
|
| 508 |
+
{text}
|
| 509 |
+
|
| 510 |
+
Based on the information above, please give a description topic in the following keyword format:
|
| 511 |
+
topic: <description>
|
| 512 |
+
"""
|
| 513 |
+
```
|
| 514 |
+
|
| 515 |
+
**NOTE: Experimenting with other, better prompts is recommended.**
|
| 516 |
+
|
| 517 |
+
### Mounted on Base-Representation
|
| 518 |
+
|
| 519 |
+
```py
|
| 520 |
+
from bertopic.representation._base import BaseRepresentation
|
| 521 |
+
from typing import List, Mapping, Tuple
|
| 522 |
+
|
| 523 |
+
class SummarizationRepresentation(BaseRepresentation):
|
| 524 |
+
def __init__(self, summarization_model, summarization_tokenizer):
|
| 525 |
+
self.summarization_model = summarization_model
|
| 526 |
+
self.summarization_tokenizer = summarization_tokenizer
|
| 527 |
+
|
| 528 |
+
def extract_topics(self, topic_model, documents, c_tf_idf, topics
|
| 529 |
+
) -> Mapping[str, List[Tuple[str, float]]]:
|
| 530 |
+
updated_topics = {}
|
| 531 |
+
for topic_id, words in topics.items():
|
| 532 |
+
# Extract only the words from the tuples
|
| 533 |
+
words_only = [word[0] for word in words]
|
| 534 |
+
text = " ".join(words_only)
|
| 535 |
+
summary = summarize_with_model(text)
|
| 536 |
+
updated_topics[topic_id] = [(summary, 1.0)]
|
| 537 |
+
return updated_topics
|
| 538 |
+
|
| 539 |
+
summarization = SummarizationRepresentation(summarization_model, summarization_tokenizer)
|
| 540 |
+
```
|