import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import json
# NLTK setup
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
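
# ResearchTools bundles the full pipeline: CSV loading, text cleaning,
# KMeans-based topic modeling, topic labeling, taxonomy mapping, and
# CSV/JSON export. A usage sketch follows the class definition.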
class ResearchTools:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.taxonomy = [
            "Artificial Intelligence and Machine Learning",
            "Blockchain and Distributed Ledger",
            "Cloud Computing",
            "Data Analytics and Business Intelligence"
        ]

    def load_csv(self, filepath):
        df = pd.read_csv(filepath)
        df.columns = df.columns.str.strip().str.lower()
        if 'title' not in df.columns or 'abstract' not in df.columns:
            raise ValueError("CSV must contain 'title' and 'abstract' columns")
        df = df.dropna(subset=['title', 'abstract'])
        return df

    def clean_text(self, text):
        # Lowercase, strip non-letters, tokenize, drop stopwords, lemmatize
        text = text.lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens if t not in self.stop_words]
        return ' '.join(tokens)

    def preprocess_corpus(self, df):
        df['combined_clean'] = df['title'].apply(self.clean_text) + " " + df['abstract'].apply(self.clean_text)
        return df

    def perform_topic_modeling(self, docs, n_topics=100):
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(docs)
        # KMeans cannot have more clusters than documents
        n_topics = min(n_topics, X.shape[0])
        kmeans = KMeans(n_clusters=n_topics, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        feature_names = vectorizer.get_feature_names_out()
        # The 10 highest-weighted centroid terms serve as each topic's keywords
        topic_keywords = []
        for i in range(n_topics):
            center = kmeans.cluster_centers_[i]
            top_idx = center.argsort()[::-1][:10]
            words = [feature_names[j] for j in top_idx]
            topic_keywords.append(words)
        topic_info = pd.DataFrame({
            'Topic': list(range(n_topics)),
            'Count': np.bincount(labels, minlength=n_topics)
        })

        # Minimal model object exposing get_topic/transform for downstream use
        class Model:
            def get_topic(self, i):
                return [(w, 1.0) for w in topic_keywords[i]]

            def transform(self, docs):
                # Note: returns the training labels regardless of `docs`
                return labels, None

        return Model(), topic_info

    def label_topics(self, model, topic_info):
        counts = dict(zip(topic_info['Topic'], topic_info['Count']))
        data = []
        for tid in topic_info['Topic']:
            words = model.get_topic(tid)
            kw = [w for w, _ in words]
            data.append({
                'topic_id': tid,
                'label': ' | '.join(kw[:3]),
                'keywords': ', '.join(kw),
                # Needed downstream: generate_keywords_csv reads 'document_count'
                'document_count': counts[tid]
            })
        return pd.DataFrame(data)

    def extract_themes(self, labels):
        return list(set(labels))

    def compare_title_abstract_themes(self, df, model):
        # Placeholder: returns a single fixed row until a real
        # title-vs-abstract comparison is implemented
        return pd.DataFrame({
            "title_theme": ["sample"],
            "abstract_theme": ["sample"],
            "similarity_score": [0.5]
        })

    def map_to_taxonomy(self, themes):
        mapped = []
        novel = []
        for t in themes:
            # Naive substring check: "ai" also matches words like "maintain"
            if "ai" in t.lower():
                mapped.append(f"{t} → Artificial Intelligence and Machine Learning")
            else:
                novel.append(t)
        return {"mapped": mapped, "novel": novel}

    def save_outputs(self, comparison_df, taxonomy_map, topic_table):
        comparison_df.to_csv("comparison.csv", index=False)
        topic_table.to_csv("topic_review_table.csv", index=False)
        with open("taxonomy_map.json", "w") as f:
            json.dump(taxonomy_map, f, indent=2)

    # 🔴 NEW FUNCTION
    def generate_keywords_csv(self, topic_table, taxonomy_map):
        rows = []
        # Rebuild {theme: category} from the "theme → category" strings
        mapped_dict = {}
        for item in taxonomy_map["mapped"]:
            parts = item.split(" → ")
            if len(parts) == 2:
                mapped_dict[parts[0]] = parts[1]
        for _, row in topic_table.iterrows():
            label = row['label']
            rows.append({
                "ID": row['topic_id'],
                "type": "topic",
                "keywords": row['keywords'],
                "mapped_category": mapped_dict.get(label, "Unknown"),
                "mapping_status": "MAPPED" if label in mapped_dict else "NOVEL",
                "relevance": row['document_count']
            })
        pd.DataFrame(rows).to_csv("keywords.csv", index=False)
        print("keywords.csv generated")