# BERTopic / tools.py — initial upload by rahull30 (commit 4bf6942)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import json
# NLTK setup: fetch the corpora/tokenizer data the pipeline needs,
# quietly skipping anything already present.
# NOTE(review): recent NLTK releases may additionally require the
# 'punkt_tab' resource for word_tokenize — confirm against the pinned
# nltk version.
for _resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource, quiet=True)
class ResearchTools:
    """Lightweight topic-modeling toolkit for research-paper CSVs.

    Pipeline: load a CSV with ``title``/``abstract`` columns, clean the
    text with NLTK, cluster documents with KMeans over bag-of-words
    counts, label the clusters, map them onto a fixed taxonomy, and
    write the results as CSV/JSON files in the working directory.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Fixed reference taxonomy consulted by map_to_taxonomy().
        self.taxonomy = [
            "Artificial Intelligence and Machine Learning",
            "Blockchain and Distributed Ledger",
            "Cloud Computing",
            "Data Analytics and Business Intelligence"
        ]

    def load_csv(self, filepath):
        """Load a papers CSV, requiring 'title' and 'abstract' columns.

        Column names are normalized (stripped, lower-cased) and rows
        missing either required field are dropped.

        Raises:
            ValueError: if either required column is absent.
        """
        df = pd.read_csv(filepath)
        df.columns = df.columns.str.strip().str.lower()
        if 'title' not in df.columns or 'abstract' not in df.columns:
            raise ValueError("CSV must contain title and abstract")
        df = df.dropna(subset=['title', 'abstract'])
        return df

    def clean_text(self, text):
        """Lower-case, strip non-letters, drop stopwords, lemmatize."""
        # str() coercion: survives numeric/NaN-free but non-string cells.
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens if t not in self.stop_words]
        return ' '.join(tokens)

    def preprocess_corpus(self, df):
        """Add a 'combined_clean' column: cleaned title + cleaned abstract."""
        df['combined_clean'] = df['title'].apply(self.clean_text) + " " + df['abstract'].apply(self.clean_text)
        return df

    def perform_topic_modeling(self, docs, n_topics=100):
        """Cluster documents into at most ``n_topics`` topics via KMeans.

        Returns:
            (model, topic_info): ``model`` is a minimal BERTopic-like
            facade exposing ``get_topic(i)`` -> [(word, weight), ...]
            and ``transform(docs)`` -> (labels, None); ``topic_info``
            is a DataFrame with 'Topic' and 'Count' columns.
        """
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(docs)
        # BUGFIX: KMeans requires n_clusters <= n_samples; the original
        # unconditionally used n_topics and crashed on small corpora.
        n_topics = min(n_topics, X.shape[0])
        kmeans = KMeans(n_clusters=n_topics, random_state=42)
        labels = kmeans.fit_predict(X)
        feature_names = vectorizer.get_feature_names_out()
        topic_keywords = []
        for i in range(n_topics):
            center = kmeans.cluster_centers_[i]
            # Top 10 highest-weight vocabulary terms for this cluster.
            top_idx = center.argsort()[::-1][:10]
            words = [feature_names[j] for j in top_idx]
            topic_keywords.append(words)
        topic_info = pd.DataFrame({
            'Topic': list(range(n_topics)),
            'Count': np.bincount(labels, minlength=n_topics)
        })

        class Model:
            # Minimal facade so downstream code can use a BERTopic-like API.
            def get_topic(self, i):
                return [(w, 1.0) for w in topic_keywords[i]]

            def transform(self, docs):
                # NOTE(review): returns the *training* labels regardless of
                # the docs argument — only valid for the fitted corpus.
                return labels, None

        return Model(), topic_info

    def label_topics(self, model, topic_info):
        """Build a review table: one row per topic with a 3-word label.

        Returns:
            DataFrame with 'topic_id', 'label' (top-3 keywords joined by
            ' | ') and 'keywords' (all keywords, comma-separated).
        """
        data = []
        for tid in topic_info['Topic']:
            words = model.get_topic(tid)
            kw = [w for w, _ in words]
            data.append({
                'topic_id': tid,
                'label': ' | '.join(kw[:3]),
                'keywords': ', '.join(kw)
            })
        return pd.DataFrame(data)

    def extract_themes(self, labels):
        """Return unique themes in deterministic first-seen order.

        BUGFIX: the original ``list(set(...))`` produced a
        nondeterministic ordering between runs.
        """
        return list(dict.fromkeys(labels))

    def compare_title_abstract_themes(self, df, model):
        """Compare title-derived vs abstract-derived themes.

        NOTE(review): stub implementation — always returns a single
        hard-coded row regardless of its inputs.
        """
        return pd.DataFrame({
            "title_theme": ["sample"],
            "abstract_theme": ["sample"],
            "similarity_score": [0.5]
        })

    def map_to_taxonomy(self, themes):
        """Map themes mentioning 'ai' onto the AI taxonomy category.

        Returns:
            dict with 'mapped' ("theme → category" strings) and
            'novel' (themes with no match) lists.
        """
        mapped = []
        novel = []
        for t in themes:
            # Substring heuristic; assumes themes are strings — a
            # non-str theme would raise AttributeError here.
            if "ai" in t.lower():
                mapped.append(f"{t} → Artificial Intelligence and Machine Learning")
            else:
                novel.append(t)
        return {"mapped": mapped, "novel": novel}

    def save_outputs(self, comparison_df, taxonomy_map, topic_table):
        """Write comparison.csv, topic_review_table.csv, taxonomy_map.json."""
        comparison_df.to_csv("comparison.csv", index=False)
        topic_table.to_csv("topic_review_table.csv", index=False)
        with open("taxonomy_map.json", "w") as f:
            json.dump(taxonomy_map, f, indent=2)

    def generate_keywords_csv(self, topic_table, taxonomy_map):
        """Write keywords.csv joining topic keywords with taxonomy mapping.

        BUGFIX: rows produced by label_topics() carry no
        'document_count' column, so the original ``row['document_count']``
        always raised KeyError; fall back to 0 when the column is absent.
        """
        mapped_dict = {}
        for item in taxonomy_map["mapped"]:
            parts = item.split(" → ")
            if len(parts) == 2:
                mapped_dict[parts[0]] = parts[1]
        rows = []
        for _, row in topic_table.iterrows():
            label = row['label']
            rows.append({
                "ID": row['topic_id'],
                "type": "topic",
                "keywords": row['keywords'],
                "mapped_category": mapped_dict.get(label, "Unknown"),
                "mapping_status": "MAPPED" if label in mapped_dict else "NOVEL",
                "relevance": row.get('document_count', 0)
            })
        pd.DataFrame(rows).to_csv("keywords.csv", index=False)
        print("keywords.csv generated")