Update tools.py
Browse files
tools.py
CHANGED
|
@@ -2222,20 +2222,46 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
|
|
| 2222 |
|
| 2223 |
canonical_patterns = [
|
| 2224 |
(re.compile(r"\bbert\b"), "BERT"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2225 |
(re.compile(r"\bword2vec\b"), "Word2Vec"),
|
| 2226 |
(re.compile(r"\bglove\b"), "GloVe"),
|
|
|
|
|
|
|
| 2227 |
(re.compile(r"\bspecter\b"), "SPECTER"),
|
| 2228 |
(re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"),
|
|
|
|
|
|
|
|
|
|
| 2229 |
(re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"),
|
| 2230 |
(re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"),
|
|
|
|
| 2231 |
(re.compile(r"\bbertopic\b"), "BERTopic"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2232 |
(re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"),
|
| 2233 |
(re.compile(r"\bsentiment\b"), "Sentiment analysis"),
|
| 2234 |
(re.compile(r"\brandom forest\b"), "Random Forest"),
|
| 2235 |
(re.compile(r"\bdecision tree\b"), "Decision Tree"),
|
| 2236 |
(re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"),
|
| 2237 |
(re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"),
|
|
|
|
|
|
|
| 2238 |
(re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2239 |
(re.compile(r"\btransformer\b"), "Transformers"),
|
| 2240 |
(re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"),
|
| 2241 |
(re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"),
|
|
@@ -2248,19 +2274,68 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
|
|
| 2248 |
(re.compile(r"\bmoderation\b"), "Moderation analysis"),
|
| 2249 |
(re.compile(r"\bchi[- ]?square\b|\bchi square\b"), "Chi-square test"),
|
| 2250 |
(re.compile(r"\banova\b"), "ANOVA"),
|
|
|
|
|
|
|
|
|
|
| 2251 |
(re.compile(r"\bt[- ]?test\b"), "t-test"),
|
|
|
|
|
|
|
| 2252 |
(re.compile(r"\bfactor analysis\b"), "Factor analysis"),
|
| 2253 |
(re.compile(r"\btime[- ]?series\b"), "Time-series analysis"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2254 |
(re.compile(r"\blogistic regression\b"), "Logistic regression"),
|
| 2255 |
(re.compile(r"\bols\b|\borderinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"),
|
|
|
|
|
|
|
|
|
|
| 2256 |
(re.compile(r"\bregression\b"), "Regression"),
|
| 2257 |
(re.compile(r"\bcentrality\b"), "Network centrality"),
|
| 2258 |
(re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"),
|
| 2259 |
(re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"),
|
| 2260 |
(re.compile(r"\blink prediction\b"), "Link prediction"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2261 |
(re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"),
|
| 2262 |
(re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"),
|
| 2263 |
(re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"),
|
|
|
|
|
|
|
|
|
|
| 2264 |
]
|
| 2265 |
|
| 2266 |
def _canonicalize_technique(name: str) -> tuple[str, str]:
|
|
@@ -2273,24 +2348,58 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
|
|
| 2273 |
return display, display.lower()
|
| 2274 |
|
| 2275 |
category_patterns = [
|
| 2276 |
-
(re.compile(r"\b(bert|transformer|fine[- ]?tuning)\b"), "Transformers"),
|
| 2277 |
-
(re.compile(r"\b(word2vec|glove|specter|sentence[- ]?transformer|embedding)\b"), "Embeddings"),
|
| 2278 |
-
(re.compile(r"\b(topic modeling|lda|nmf|bertopic)\b"), "Topic Modeling"),
|
| 2279 |
-
(re.compile(r"\b(
|
| 2280 |
-
(re.compile(r"\b(
|
| 2281 |
-
(re.compile(r"\b(
|
| 2282 |
-
(re.compile(r"\b(
|
| 2283 |
-
(re.compile(r"\b(
|
| 2284 |
-
(re.compile(r"\b(
|
| 2285 |
-
(re.compile(r"\b(
|
| 2286 |
-
(re.compile(r"\b(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2287 |
]
|
| 2288 |
|
| 2289 |
-
def _categorize_technique(
|
| 2290 |
-
|
| 2291 |
-
|
| 2292 |
-
|
| 2293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2294 |
return "Other"
|
| 2295 |
|
| 2296 |
category_map: dict[str, dict[str, object]] = {}
|
|
@@ -2311,7 +2420,7 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
|
|
| 2311 |
algorithm, _ = _canonicalize_technique(technique)
|
| 2312 |
if not algorithm:
|
| 2313 |
continue
|
| 2314 |
-
category = _categorize_technique(technique)
|
| 2315 |
key = category.lower()
|
| 2316 |
if key not in category_map:
|
| 2317 |
category_map[key] = {
|
|
@@ -2361,4 +2470,4 @@ ALL_TOOLS = [
|
|
| 2361 |
generate_comparison_csv,
|
| 2362 |
export_narrative,
|
| 2363 |
extract_methods_from_pdfs,
|
| 2364 |
-
]
|
|
|
|
| 2222 |
|
| 2223 |
canonical_patterns = [
|
| 2224 |
(re.compile(r"\bbert\b"), "BERT"),
|
| 2225 |
+
(re.compile(r"\broberta\b"), "RoBERTa"),
|
| 2226 |
+
(re.compile(r"\bxlm[- ]?roberta\b"), "XLM-RoBERTa"),
|
| 2227 |
+
(re.compile(r"\bgpt[- ]?[0-9]*\b"), "GPT"),
|
| 2228 |
+
(re.compile(r"\bt5\b"), "T5"),
|
| 2229 |
(re.compile(r"\bword2vec\b"), "Word2Vec"),
|
| 2230 |
(re.compile(r"\bglove\b"), "GloVe"),
|
| 2231 |
+
(re.compile(r"\bdoc2vec\b"), "Doc2Vec"),
|
| 2232 |
+
(re.compile(r"\bfasttext\b"), "fastText"),
|
| 2233 |
(re.compile(r"\bspecter\b"), "SPECTER"),
|
| 2234 |
(re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"),
|
| 2235 |
+
(re.compile(r"\btf[- ]?idf\b"), "TF-IDF"),
|
| 2236 |
+
(re.compile(r"\bbm25\b"), "BM25"),
|
| 2237 |
+
(re.compile(r"\bbag of words\b|\bbow\b"), "Bag-of-words"),
|
| 2238 |
(re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"),
|
| 2239 |
(re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"),
|
| 2240 |
+
(re.compile(r"\blsa\b|\blsi\b|\blatent semantic analysis\b"), "LSA"),
|
| 2241 |
(re.compile(r"\bbertopic\b"), "BERTopic"),
|
| 2242 |
+
(re.compile(r"\bk[- ]?means\b"), "K-means clustering"),
|
| 2243 |
+
(re.compile(r"\bhierarchical clustering\b"), "Hierarchical clustering"),
|
| 2244 |
+
(re.compile(r"\bdbscan\b"), "DBSCAN"),
|
| 2245 |
+
(re.compile(r"\bhdbscan\b"), "HDBSCAN"),
|
| 2246 |
+
(re.compile(r"\bgmm\b|\bgaussian mixture\b"), "Gaussian mixture model"),
|
| 2247 |
+
(re.compile(r"\bpca\b|\bprincipal component analysis\b"), "PCA"),
|
| 2248 |
+
(re.compile(r"\bsvd\b|\bsingular value decomposition\b"), "SVD"),
|
| 2249 |
+
(re.compile(r"\btsne\b|\bt-sne\b"), "t-SNE"),
|
| 2250 |
+
(re.compile(r"\bumap\b"), "UMAP"),
|
| 2251 |
(re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"),
|
| 2252 |
(re.compile(r"\bsentiment\b"), "Sentiment analysis"),
|
| 2253 |
(re.compile(r"\brandom forest\b"), "Random Forest"),
|
| 2254 |
(re.compile(r"\bdecision tree\b"), "Decision Tree"),
|
| 2255 |
(re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"),
|
| 2256 |
(re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"),
|
| 2257 |
+
(re.compile(r"\bknn\b|\bk[- ]?nearest neighbor\b"), "KNN"),
|
| 2258 |
+
(re.compile(r"\bnaive bayes\b"), "Naive Bayes"),
|
| 2259 |
(re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"),
|
| 2260 |
+
(re.compile(r"\bcnn\b|\bconvolutional neural network\b"), "CNN"),
|
| 2261 |
+
(re.compile(r"\brnn\b|\brecurrent neural network\b"), "RNN"),
|
| 2262 |
+
(re.compile(r"\blstm\b"), "LSTM"),
|
| 2263 |
+
(re.compile(r"\bgru\b"), "GRU"),
|
| 2264 |
+
(re.compile(r"\bautoencoder\b"), "Autoencoder"),
|
| 2265 |
(re.compile(r"\btransformer\b"), "Transformers"),
|
| 2266 |
(re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"),
|
| 2267 |
(re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"),
|
|
|
|
| 2274 |
(re.compile(r"\bmoderation\b"), "Moderation analysis"),
|
| 2275 |
(re.compile(r"\bchi[- ]?square\b|\bchi square\b"), "Chi-square test"),
|
| 2276 |
(re.compile(r"\banova\b"), "ANOVA"),
|
| 2277 |
+
(re.compile(r"\bmanova\b"), "MANOVA"),
|
| 2278 |
+
(re.compile(r"\bancova\b"), "ANCOVA"),
|
| 2279 |
+
(re.compile(r"\bmancova\b"), "MANCOVA"),
|
| 2280 |
(re.compile(r"\bt[- ]?test\b"), "t-test"),
|
| 2281 |
+
(re.compile(r"\bwilcoxon\b"), "Wilcoxon test"),
|
| 2282 |
+
(re.compile(r"\bkruskal[- ]?wallis\b"), "Kruskal-Wallis test"),
|
| 2283 |
(re.compile(r"\bfactor analysis\b"), "Factor analysis"),
|
| 2284 |
(re.compile(r"\btime[- ]?series\b"), "Time-series analysis"),
|
| 2285 |
+
(re.compile(r"\barima\b"), "ARIMA"),
|
| 2286 |
+
(re.compile(r"\bsarima\b"), "SARIMA"),
|
| 2287 |
+
(re.compile(r"\bvar\b|\bvector autoregression\b"), "VAR"),
|
| 2288 |
+
(re.compile(r"\bprophet\b"), "Prophet"),
|
| 2289 |
+
(re.compile(r"\bpanel regression\b|\bpanel data\b"), "Panel regression"),
|
| 2290 |
+
(re.compile(r"\bfixed effects\b"), "Fixed-effects regression"),
|
| 2291 |
+
(re.compile(r"\brandom effects\b"), "Random-effects regression"),
|
| 2292 |
+
(re.compile(r"\bmultilevel\b|\bhierarchical linear model\b|\bhlm\b|\bmixed effects\b"), "Multilevel / mixed-effects regression"),
|
| 2293 |
+
(re.compile(r"\bglm\b|\bgeneralized linear model\b"), "Generalized linear model"),
|
| 2294 |
+
(re.compile(r"\bgls\b|\bgeneralized least squares\b"), "Generalized least squares"),
|
| 2295 |
+
(re.compile(r"\bgee\b|\bgeneralized estimating equation\b"), "GEE"),
|
| 2296 |
+
(re.compile(r"\bgmm\b|\bgeneralized method of moments\b"), "GMM"),
|
| 2297 |
+
(re.compile(r"\b2sls\b|\btwo[- ]?stage least squares\b"), "2SLS"),
|
| 2298 |
+
(re.compile(r"\b3sls\b|\bthree[- ]?stage least squares\b"), "3SLS"),
|
| 2299 |
+
(re.compile(r"\binstrumental variable\b|\biv\b"), "Instrumental variables"),
|
| 2300 |
+
(re.compile(r"\btobit\b"), "Tobit regression"),
|
| 2301 |
+
(re.compile(r"\bheckman\b"), "Heckman selection model"),
|
| 2302 |
+
(re.compile(r"\bpoisson\b"), "Poisson regression"),
|
| 2303 |
+
(re.compile(r"\bnegative binomial\b"), "Negative binomial regression"),
|
| 2304 |
+
(re.compile(r"\bprobit\b"), "Probit regression"),
|
| 2305 |
+
(re.compile(r"\bsurvival analysis\b|\bcox\b|\bhazard model\b|\bkaplan[- ]?meier\b"), "Survival analysis"),
|
| 2306 |
+
(re.compile(r"\blatent class analysis\b|\blca\b"), "Latent class analysis"),
|
| 2307 |
+
(re.compile(r"\blatent profile analysis\b|\blpa\b"), "Latent profile analysis"),
|
| 2308 |
(re.compile(r"\blogistic regression\b"), "Logistic regression"),
|
| 2309 |
(re.compile(r"\bols\b|\borderinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"),
|
| 2310 |
+
(re.compile(r"\bridge regression\b|\bridge\b"), "Ridge regression"),
|
| 2311 |
+
(re.compile(r"\blasso\b"), "LASSO regression"),
|
| 2312 |
+
(re.compile(r"\belastic net\b"), "Elastic Net regression"),
|
| 2313 |
(re.compile(r"\bregression\b"), "Regression"),
|
| 2314 |
(re.compile(r"\bcentrality\b"), "Network centrality"),
|
| 2315 |
(re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"),
|
| 2316 |
(re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"),
|
| 2317 |
(re.compile(r"\blink prediction\b"), "Link prediction"),
|
| 2318 |
+
(re.compile(r"\bpagerank\b|\bpage rank\b"), "PageRank"),
|
| 2319 |
+
(re.compile(r"\bgraph neural network\b|\bgnn\b"), "Graph neural networks"),
|
| 2320 |
+
(re.compile(r"\bhidden markov\b|\bhmm\b"), "Hidden Markov Model"),
|
| 2321 |
+
(re.compile(r"\bmarkov chain\b|\bmarkov model\b"), "Markov models"),
|
| 2322 |
+
(re.compile(r"\bkalman filter\b"), "Kalman filter"),
|
| 2323 |
+
(re.compile(r"\bstate[- ]?space\b"), "State-space models"),
|
| 2324 |
+
(re.compile(r"\bhawkes\b"), "Hawkes process"),
|
| 2325 |
+
(re.compile(r"\brecommender\b|\bcollaborative filtering\b|\bmatrix factorization\b"), "Recommender systems"),
|
| 2326 |
+
(re.compile(r"\bahp\b|\banalytic hierarchy process\b"), "AHP"),
|
| 2327 |
+
(re.compile(r"\btopsis\b"), "TOPSIS"),
|
| 2328 |
+
(re.compile(r"\bvikor\b"), "VIKOR"),
|
| 2329 |
+
(re.compile(r"\bpromethee\b"), "PROMETHEE"),
|
| 2330 |
+
(re.compile(r"\bdematel\b"), "DEMATEL"),
|
| 2331 |
+
(re.compile(r"\bdea\b|\bdata envelopment analysis\b"), "DEA"),
|
| 2332 |
+
(re.compile(r"\bsfa\b|\bstochastic frontier\b"), "SFA"),
|
| 2333 |
(re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"),
|
| 2334 |
(re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"),
|
| 2335 |
(re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"),
|
| 2336 |
+
(re.compile(r"\blinear programming\b|\binteger programming\b|\bmixed integer\b"), "Mathematical optimization"),
|
| 2337 |
+
(re.compile(r"\bgenetic algorithm\b"), "Genetic algorithms"),
|
| 2338 |
+
(re.compile(r"\bsimulated annealing\b"), "Simulated annealing"),
|
| 2339 |
]
|
| 2340 |
|
| 2341 |
def _canonicalize_technique(name: str) -> tuple[str, str]:
|
|
|
|
| 2348 |
return display, display.lower()
|
| 2349 |
|
| 2350 |
# Ordered (compiled regex, category label) pairs. The consumer below returns
# the label of the FIRST pattern that matches, so order is significant:
# generic alternatives in an early entry can shadow specific phrases that a
# later entry lists explicitly. The lookarounds marked below keep three such
# phrases reachable without reordering the table.
category_patterns = [
    # "sentence transformer" is an embedding model; the lookbehinds keep the
    # bare "transformer" alternative from claiming it first.
    (re.compile(r"\b(bert|roberta|xlm roberta|gpt|t5|(?<!sentence )(?<!sentence-)transformer|fine[- ]?tuning)\b"), "Transformers"),
    (re.compile(r"\b(word2vec|glove|doc2vec|fasttext|specter|sentence[- ]?transformer|embedding|tf[- ]?idf|bm25|bag of words|bow)\b"), "Embeddings / Representation"),
    (re.compile(r"\b(topic modeling|lda|nmf|bertopic|lsa|lsi)\b"), "Topic Modeling"),
    # NOTE(review): "gmm" is claimed here (Gaussian mixture), so the "gmm"
    # alternative in the econometric entry below is never reached.
    (re.compile(r"\b(k[- ]?means|hierarchical clustering|dbscan|hdbscan|gaussian mixture|gmm|clustering)\b"), "Clustering"),
    (re.compile(r"\b(pca|svd|t-sne|tsne|umap|dimensionality reduction)\b"), "Dimensionality Reduction"),
    (re.compile(r"\b(arima|sarima|var|prophet|time[- ]?series)\b"), "Time Series / Forecasting"),
    (re.compile(r"\b(panel data|panel regression|fixed effects|random effects|multilevel|hierarchical linear model|hlm|mixed effects|glm|gls|gee|gmm|2sls|3sls|instrumental variable|tobit|heckman|poisson|negative binomial|probit|logit)\b"), "Econometric / Panel Models"),
    # "regression discontinuity" belongs to Causal Inference further down;
    # the lookahead stops the generic "regression" alternative from taking it.
    (re.compile(r"\b(ols|linear regression|logistic regression|ridge|lasso|elastic net|regression(?! discontinuity))\b"), "Regression"),
    (re.compile(r"\b(sem|pls[- ]?sem|cb[- ]?sem|structural equation|cfa|efa)\b"), "SEM"),
    (re.compile(r"\b(latent class analysis|latent profile analysis|latent variable|mixture model)\b"), "Latent Variable Models"),
    (re.compile(r"\b(grad(ient)? boosting|xgboost|lightgbm|catboost)\b"), "Boosting / Ensembles"),
    (re.compile(r"\b(random forest|decision tree|svm|knn|naive bayes)\b"), "Classic ML"),
    # "graph neural network" is listed under Network Analysis below; the
    # lookbehind leaves it for that entry.
    (re.compile(r"\b((?<!graph )neural network|deep learning|lstm|cnn|rnn|gru|mlp|autoencoder)\b"), "Deep Learning"),
    (re.compile(r"\b(ner|named entity recognition|sentiment|nlp|text mining|tokenization|stemming|lemmatization|keyword extraction)\b"), "NLP / Text Mining"),
    (re.compile(r"\b(network|centrality|community detection|louvain|leiden|ergm|link prediction|pagerank|graph neural network|gnn)\b"), "Network Analysis"),
    (re.compile(r"\b(agent[- ]?based|monte carlo|bayesian optimization|linear programming|integer programming|genetic algorithm|simulated annealing)\b"), "Simulation / Optimization"),
    (re.compile(r"\b(survival|cox|hazard|kaplan[- ]?meier)\b"), "Survival / Event History"),
    (re.compile(r"\b(bayesian|mcmc|gibbs|variational)\b"), "Bayesian Methods"),
    (re.compile(r"\b(anova|manova|ancova|mancova|t[- ]?test|chi[- ]?square|factor analysis|glmm|irt|mediation|moderation|wilcoxon|kruskal[- ]?wallis)\b"), "Statistical Tests / Models"),
    (re.compile(r"\b(difference[- ]?in[- ]?differences|did|regression discontinuity|rdd|instrumental variable|iv|propensity score|matching)\b"), "Causal Inference"),
    (re.compile(r"\b(recommender|collaborative filtering|matrix factorization)\b"), "Recommender Systems"),
    (re.compile(r"\b(hidden markov|hmm|markov|kalman|state[- ]?space|hawkes)\b"), "Sequence / Stochastic Processes"),
    (re.compile(r"\b(ahp|analytic hierarchy process|topsis|vikor|promethee|dematel)\b"), "Decision Analysis / MCDA"),
    (re.compile(r"\b(dea|data envelopment analysis|stochastic frontier|sfa|frontier analysis)\b"), "Efficiency / Frontier Analysis"),
]
|
| 2376 |
|
| 2377 |
+
def _categorize_technique(*names: str) -> str:
    """Return the category label for a technique given one or more names.

    Each non-empty name is normalized with ``_normalize_technique_key`` and
    then classified in three stages, stopping at the first hit:

    1. The ordered ``category_patterns`` regex table, tried name by name.
    2. A table of plain substring keywords per category.
    3. A generic "model / analysis / estimation / test" substring check,
       which yields "Statistical Tests / Models".

    Args:
        *names: Candidate names for the same technique (for example the raw
            extracted string and its canonical display name). Empty or
            ``None`` entries are ignored.

    Returns:
        The matched category label, or ``"Other"`` when nothing matches or
        no usable name was supplied.
    """
    # Normalize once up front so the fallback stages see every name. The
    # earlier version reused the loop variable from stage 1, which only
    # covered the last name and was unbound when all names were empty.
    keys = [_normalize_technique_key(name) for name in names if name]

    # Stage 1: regex table; table order decides ties.
    for key in keys:
        for pattern, category in category_patterns:
            if pattern.search(key):
                return category

    # Stage 2: substring keywords.
    # NOTE(review): these are raw substring tests, so short keywords such as
    # "var", "sem", "stem" or "edge" also hit inside longer words
    # ("variable", "ensemble", "system", "knowledge"). Confirm that is
    # acceptable before relying on these categories.
    fallback_keywords = (
        ("Classic ML", ["classifier", "classification", "predictive model", "prediction", "supervised"]),
        ("Clustering", ["cluster", "clustering"]),
        ("Topic Modeling", ["topic", "semantic"]),
        ("Embeddings / Representation", ["embedding", "vector", "tf idf", "bow", "bag of words"]),
        ("Regression", ["regression", "logit", "probit", "panel", "fixed effects", "random effects", "glm", "gls", "gee", "gmm"]),
        ("SEM", ["sem", "structural equation", "factor", "latent"]),
        ("Bayesian Methods", ["bayesian", "mcmc", "gibbs", "prior", "posterior"]),
        ("Time Series / Forecasting", ["time series", "forecast", "arima", "sarima", "var", "prophet"]),
        ("NLP / Text Mining", ["nlp", "text", "token", "lemma", "stem", "language"]),
        ("Network Analysis", ["network", "graph", "node", "edge"]),
        ("Simulation / Optimization", ["simulation", "optimi", "heuristic", "metaheuristic", "monte carlo", "agent-based"]),
    )
    for key in keys:
        for category, keywords in fallback_keywords:
            if any(keyword in key for keyword in keywords):
                return category

    # Stage 3: anything that still reads like a statistical procedure.
    generic_tokens = ("model", "analysis", "estimation", "test")
    if any(token in key for key in keys for token in generic_tokens):
        return "Statistical Tests / Models"

    return "Other"
|
| 2404 |
|
| 2405 |
category_map: dict[str, dict[str, object]] = {}
|
|
|
|
| 2420 |
algorithm, _ = _canonicalize_technique(technique)
|
| 2421 |
if not algorithm:
|
| 2422 |
continue
|
| 2423 |
+
category = _categorize_technique(technique, algorithm)
|
| 2424 |
key = category.lower()
|
| 2425 |
if key not in category_map:
|
| 2426 |
category_map[key] = {
|
|
|
|
| 2470 |
generate_comparison_csv,
|
| 2471 |
export_narrative,
|
| 2472 |
extract_methods_from_pdfs,
|
| 2473 |
+
]
|