import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import json

# Download required NLTK resources quietly (no-op if already present).
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # needed by word_tokenize on newer NLTK releases
nltk.download('wordnet', quiet=True)


class ResearchTools:

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.taxonomy = [
            "Artificial Intelligence and Machine Learning",
            "Blockchain and Distributed Ledger",
            "Cloud Computing",
            "Data Analytics and Business Intelligence"
        ]

    def load_csv(self, filepath):
        df = pd.read_csv(filepath)
        df.columns = df.columns.str.strip().str.lower()

        if 'title' not in df.columns or 'abstract' not in df.columns:
            raise ValueError("CSV must contain 'title' and 'abstract' columns")

        df = df.dropna(subset=['title', 'abstract'])
        return df

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens if t not in self.stop_words]
        return ' '.join(tokens)

    def preprocess_corpus(self, df):
        df['combined_clean'] = df['title'].apply(self.clean_text) + " " + df['abstract'].apply(self.clean_text)
        return df

    def perform_topic_modeling(self, docs, n_topics=100):
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(docs)

        # KMeans requires n_clusters <= n_samples, so clamp for small corpora.
        n_topics = min(n_topics, X.shape[0])

        kmeans = KMeans(n_clusters=n_topics, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)

        feature_names = vectorizer.get_feature_names_out()

        # Collect the ten highest-weighted terms from each cluster centroid.
        topic_keywords = []
        for i in range(n_topics):
            center = kmeans.cluster_centers_[i]
            top_idx = center.argsort()[::-1][:10]
            words = [feature_names[j] for j in top_idx]
            topic_keywords.append(words)

        topic_info = pd.DataFrame({
            'Topic': list(range(n_topics)),
            'Count': np.bincount(labels, minlength=n_topics)
        })

        # Minimal wrapper exposing get_topic()/transform() over the KMeans result.
        class Model:
            def get_topic(self, i):
                return [(w, 1.0) for w in topic_keywords[i]]

            def transform(self, docs):
                # Returns the labels from fitting; new documents are not re-assigned.
                return labels, None

        return Model(), topic_info

    def label_topics(self, model, topic_info):
        counts = dict(zip(topic_info['Topic'], topic_info['Count']))
        data = []
        for tid in topic_info['Topic']:
            words = model.get_topic(tid)
            kw = [w for w, _ in words]
            data.append({
                'topic_id': tid,
                'label': ' | '.join(kw[:3]),
                'keywords': ', '.join(kw),
                'document_count': counts[tid]
            })
        return pd.DataFrame(data)

    def extract_themes(self, labels):
        return list(set(labels))

    def compare_title_abstract_themes(self, df, model):
        # Placeholder: returns a single dummy row until a real title/abstract comparison is implemented.
        return pd.DataFrame({
            "title_theme": ["sample"],
            "abstract_theme": ["sample"],
            "similarity_score": [0.5]
        })

    def map_to_taxonomy(self, themes):
        mapped = []
        novel = []

        for t in themes:
            # Match "ai" as a whole word so themes like "maintain" or "domain" are not mapped.
            if re.search(r'\bai\b', t.lower()):
                mapped.append(f"{t} → Artificial Intelligence and Machine Learning")
            else:
                novel.append(t)

        return {"mapped": mapped, "novel": novel}

    def save_outputs(self, comparison_df, taxonomy_map, topic_table):
        comparison_df.to_csv("comparison.csv", index=False)
        topic_table.to_csv("topic_review_table.csv", index=False)

        with open("taxonomy_map.json", "w") as f:
            json.dump(taxonomy_map, f, indent=2)

    def generate_keywords_csv(self, topic_table, taxonomy_map):
        rows = []

        # Build a lookup from each mapped theme's text to its taxonomy category.
        mapped_dict = {}
        for item in taxonomy_map["mapped"]:
            parts = item.split(" → ")
            if len(parts) == 2:
                mapped_dict[parts[0]] = parts[1]

        for _, row in topic_table.iterrows():
            label = row['label']

            rows.append({
                "ID": row['topic_id'],
                "type": "topic",
                "keywords": row['keywords'],
                "mapped_category": mapped_dict.get(label, "Unknown"),
                "mapping_status": "MAPPED" if label in mapped_dict else "NOVEL",
                "relevance": row['document_count']
            })

        pd.DataFrame(rows).to_csv("keywords.csv", index=False)
        print("keywords.csv generated")
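

# Example usage: a minimal sketch, not part of the original pipeline. The input
# filename "papers.csv" and the small n_topics value are illustrative assumptions;
# the CSV only needs 'title' and 'abstract' columns.
if __name__ == "__main__":
    tools = ResearchTools()
    df = tools.load_csv("papers.csv")  # hypothetical input path
    df = tools.preprocess_corpus(df)

    # Cluster the cleaned title+abstract text and label each topic.
    model, topic_info = tools.perform_topic_modeling(df['combined_clean'].tolist(), n_topics=10)
    topic_table = tools.label_topics(model, topic_info)

    # Map topic labels onto the fixed taxonomy, then write all output files.
    taxonomy_map = tools.map_to_taxonomy(topic_table['label'].tolist())
    comparison_df = tools.compare_title_abstract_themes(df, model)
    tools.save_outputs(comparison_df, taxonomy_map, topic_table)
    tools.generate_keywords_csv(topic_table, taxonomy_map)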