atharvthite05 committed on
Commit
0d2dcbb
·
verified ·
1 Parent(s): 5052b2c

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +127 -18
tools.py CHANGED
@@ -2222,20 +2222,46 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
2222
 
2223
  canonical_patterns = [
2224
  (re.compile(r"\bbert\b"), "BERT"),
 
 
 
 
2225
  (re.compile(r"\bword2vec\b"), "Word2Vec"),
2226
  (re.compile(r"\bglove\b"), "GloVe"),
 
 
2227
  (re.compile(r"\bspecter\b"), "SPECTER"),
2228
  (re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"),
 
 
 
2229
  (re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"),
2230
  (re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"),
 
2231
  (re.compile(r"\bbertopic\b"), "BERTopic"),
 
 
 
 
 
 
 
 
 
2232
  (re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"),
2233
  (re.compile(r"\bsentiment\b"), "Sentiment analysis"),
2234
  (re.compile(r"\brandom forest\b"), "Random Forest"),
2235
  (re.compile(r"\bdecision tree\b"), "Decision Tree"),
2236
  (re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"),
2237
  (re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"),
 
 
2238
  (re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"),
 
 
 
 
 
2239
  (re.compile(r"\btransformer\b"), "Transformers"),
2240
  (re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"),
2241
  (re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"),
@@ -2248,19 +2274,68 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
2248
  (re.compile(r"\bmoderation\b"), "Moderation analysis"),
2249
  (re.compile(r"\bchi[- ]?square\b|\bchi square\b"), "Chi-square test"),
2250
  (re.compile(r"\banova\b"), "ANOVA"),
 
 
 
2251
  (re.compile(r"\bt[- ]?test\b"), "t-test"),
 
 
2252
  (re.compile(r"\bfactor analysis\b"), "Factor analysis"),
2253
  (re.compile(r"\btime[- ]?series\b"), "Time-series analysis"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2254
  (re.compile(r"\blogistic regression\b"), "Logistic regression"),
2255
  (re.compile(r"\bols\b|\borderinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"),
 
 
 
2256
  (re.compile(r"\bregression\b"), "Regression"),
2257
  (re.compile(r"\bcentrality\b"), "Network centrality"),
2258
  (re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"),
2259
  (re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"),
2260
  (re.compile(r"\blink prediction\b"), "Link prediction"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2261
  (re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"),
2262
  (re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"),
2263
  (re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"),
 
 
 
2264
  ]
2265
 
2266
  def _canonicalize_technique(name: str) -> tuple[str, str]:
@@ -2273,24 +2348,58 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
2273
  return display, display.lower()
2274
 
2275
  category_patterns = [
2276
- (re.compile(r"\b(bert|transformer|fine[- ]?tuning)\b"), "Transformers"),
2277
- (re.compile(r"\b(word2vec|glove|specter|sentence[- ]?transformer|embedding)\b"), "Embeddings"),
2278
- (re.compile(r"\b(topic modeling|lda|nmf|bertopic)\b"), "Topic Modeling"),
2279
- (re.compile(r"\b(ols|linear regression|logistic regression|regression)\b"), "Regression"),
2280
- (re.compile(r"\b(sem|pls[- ]?sem|cb[- ]?sem|structural equation)\b"), "SEM"),
2281
- (re.compile(r"\b(random forest|decision tree|svm|gradient boosting|xgboost|lightgbm|catboost)\b"), "Classic ML"),
2282
- (re.compile(r"\b(neural network|deep learning|lstm|cnn|mlp)\b"), "Deep Learning"),
2283
- (re.compile(r"\b(network|centrality|community detection|louvain|leiden|ergm|link prediction)\b"), "Network Analysis"),
2284
- (re.compile(r"\b(agent[- ]?based|monte carlo|bayesian optimization)\b"), "Simulation / Optimization"),
2285
- (re.compile(r"\b(anova|t[- ]?test|chi[- ]?square|factor analysis|time[- ]?series|glmm|irt|bayesian inference|mediation|moderation)\b"), "Statistical Tests / Models"),
2286
- (re.compile(r"\b(sentiment|ner|named entity recognition|nlp|text mining)\b"), "NLP / Text Mining"),
 
 
 
 
 
 
 
 
 
 
 
 
 
2287
  ]
2288
 
2289
- def _categorize_technique(name: str) -> str:
2290
- key = _normalize_technique_key(name)
2291
- for pattern, category in category_patterns:
2292
- if pattern.search(key):
2293
- return category
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2294
  return "Other"
2295
 
2296
  category_map: dict[str, dict[str, object]] = {}
@@ -2311,7 +2420,7 @@ def extract_methods_from_pdfs(pdf_dir: str) -> dict:
2311
  algorithm, _ = _canonicalize_technique(technique)
2312
  if not algorithm:
2313
  continue
2314
- category = _categorize_technique(technique)
2315
  key = category.lower()
2316
  if key not in category_map:
2317
  category_map[key] = {
@@ -2361,4 +2470,4 @@ ALL_TOOLS = [
2361
  generate_comparison_csv,
2362
  export_narrative,
2363
  extract_methods_from_pdfs,
2364
- ]
 
2222
 
2223
  canonical_patterns = [
2224
  (re.compile(r"\bbert\b"), "BERT"),
2225
+ (re.compile(r"\broberta\b"), "RoBERTa"),
2226
+ (re.compile(r"\bxlm[- ]?roberta\b"), "XLM-RoBERTa"),
2227
+ (re.compile(r"\bgpt[- ]?[0-9]*\b"), "GPT"),
2228
+ (re.compile(r"\bt5\b"), "T5"),
2229
  (re.compile(r"\bword2vec\b"), "Word2Vec"),
2230
  (re.compile(r"\bglove\b"), "GloVe"),
2231
+ (re.compile(r"\bdoc2vec\b"), "Doc2Vec"),
2232
+ (re.compile(r"\bfasttext\b"), "fastText"),
2233
  (re.compile(r"\bspecter\b"), "SPECTER"),
2234
  (re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"),
2235
+ (re.compile(r"\btf[- ]?idf\b"), "TF-IDF"),
2236
+ (re.compile(r"\bbm25\b"), "BM25"),
2237
+ (re.compile(r"\bbag of words\b|\bbow\b"), "Bag-of-words"),
2238
  (re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"),
2239
  (re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"),
2240
+ (re.compile(r"\blsa\b|\blsi\b|\blatent semantic analysis\b"), "LSA"),
2241
  (re.compile(r"\bbertopic\b"), "BERTopic"),
2242
+ (re.compile(r"\bk[- ]?means\b"), "K-means clustering"),
2243
+ (re.compile(r"\bhierarchical clustering\b"), "Hierarchical clustering"),
2244
+ (re.compile(r"\bdbscan\b"), "DBSCAN"),
2245
+ (re.compile(r"\bhdbscan\b"), "HDBSCAN"),
2246
+ (re.compile(r"\bgmm\b|\bgaussian mixture\b"), "Gaussian mixture model"),
2247
+ (re.compile(r"\bpca\b|\bprincipal component analysis\b"), "PCA"),
2248
+ (re.compile(r"\bsvd\b|\bsingular value decomposition\b"), "SVD"),
2249
+ (re.compile(r"\btsne\b|\bt-sne\b"), "t-SNE"),
2250
+ (re.compile(r"\bumap\b"), "UMAP"),
2251
  (re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"),
2252
  (re.compile(r"\bsentiment\b"), "Sentiment analysis"),
2253
  (re.compile(r"\brandom forest\b"), "Random Forest"),
2254
  (re.compile(r"\bdecision tree\b"), "Decision Tree"),
2255
  (re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"),
2256
  (re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"),
2257
+ (re.compile(r"\bknn\b|\bk[- ]?nearest neighbor\b"), "KNN"),
2258
+ (re.compile(r"\bnaive bayes\b"), "Naive Bayes"),
2259
  (re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"),
2260
+ (re.compile(r"\bcnn\b|\bconvolutional neural network\b"), "CNN"),
2261
+ (re.compile(r"\brnn\b|\brecurrent neural network\b"), "RNN"),
2262
+ (re.compile(r"\blstm\b"), "LSTM"),
2263
+ (re.compile(r"\bgru\b"), "GRU"),
2264
+ (re.compile(r"\bautoencoder\b"), "Autoencoder"),
2265
  (re.compile(r"\btransformer\b"), "Transformers"),
2266
  (re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"),
2267
  (re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"),
 
2274
  (re.compile(r"\bmoderation\b"), "Moderation analysis"),
2275
  (re.compile(r"\bchi[- ]?square\b|\bchi square\b"), "Chi-square test"),
2276
  (re.compile(r"\banova\b"), "ANOVA"),
2277
+ (re.compile(r"\bmanova\b"), "MANOVA"),
2278
+ (re.compile(r"\bancova\b"), "ANCOVA"),
2279
+ (re.compile(r"\bmancova\b"), "MANCOVA"),
2280
  (re.compile(r"\bt[- ]?test\b"), "t-test"),
2281
+ (re.compile(r"\bwilcoxon\b"), "Wilcoxon test"),
2282
+ (re.compile(r"\bkruskal[- ]?wallis\b"), "Kruskal-Wallis test"),
2283
  (re.compile(r"\bfactor analysis\b"), "Factor analysis"),
2284
  (re.compile(r"\btime[- ]?series\b"), "Time-series analysis"),
2285
+ (re.compile(r"\barima\b"), "ARIMA"),
2286
+ (re.compile(r"\bsarima\b"), "SARIMA"),
2287
+ (re.compile(r"\bvar\b|\bvector autoregression\b"), "VAR"),
2288
+ (re.compile(r"\bprophet\b"), "Prophet"),
2289
+ (re.compile(r"\bpanel regression\b|\bpanel data\b"), "Panel regression"),
2290
+ (re.compile(r"\bfixed effects\b"), "Fixed-effects regression"),
2291
+ (re.compile(r"\brandom effects\b"), "Random-effects regression"),
2292
+ (re.compile(r"\bmultilevel\b|\bhierarchical linear model\b|\bhlm\b|\bmixed effects\b"), "Multilevel / mixed-effects regression"),
2293
+ (re.compile(r"\bglm\b|\bgeneralized linear model\b"), "Generalized linear model"),
2294
+ (re.compile(r"\bgls\b|\bgeneralized least squares\b"), "Generalized least squares"),
2295
+ (re.compile(r"\bgee\b|\bgeneralized estimating equation\b"), "GEE"),
2296
+ (re.compile(r"\bgmm\b|\bgeneralized method of moments\b"), "GMM"),
2297
+ (re.compile(r"\b2sls\b|\btwo[- ]?stage least squares\b"), "2SLS"),
2298
+ (re.compile(r"\b3sls\b|\bthree[- ]?stage least squares\b"), "3SLS"),
2299
+ (re.compile(r"\binstrumental variable\b|\biv\b"), "Instrumental variables"),
2300
+ (re.compile(r"\btobit\b"), "Tobit regression"),
2301
+ (re.compile(r"\bheckman\b"), "Heckman selection model"),
2302
+ (re.compile(r"\bpoisson\b"), "Poisson regression"),
2303
+ (re.compile(r"\bnegative binomial\b"), "Negative binomial regression"),
2304
+ (re.compile(r"\bprobit\b"), "Probit regression"),
2305
+ (re.compile(r"\bsurvival analysis\b|\bcox\b|\bhazard model\b|\bkaplan[- ]?meier\b"), "Survival analysis"),
2306
+ (re.compile(r"\blatent class analysis\b|\blca\b"), "Latent class analysis"),
2307
+ (re.compile(r"\blatent profile analysis\b|\blpa\b"), "Latent profile analysis"),
2308
  (re.compile(r"\blogistic regression\b"), "Logistic regression"),
2309
  (re.compile(r"\bols\b|\borderinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"),
2310
+ (re.compile(r"\bridge regression\b|\bridge\b"), "Ridge regression"),
2311
+ (re.compile(r"\blasso\b"), "LASSO regression"),
2312
+ (re.compile(r"\belastic net\b"), "Elastic Net regression"),
2313
  (re.compile(r"\bregression\b"), "Regression"),
2314
  (re.compile(r"\bcentrality\b"), "Network centrality"),
2315
  (re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"),
2316
  (re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"),
2317
  (re.compile(r"\blink prediction\b"), "Link prediction"),
2318
+ (re.compile(r"\bpagerank\b|\bpage rank\b"), "PageRank"),
2319
+ (re.compile(r"\bgraph neural network\b|\bgnn\b"), "Graph neural networks"),
2320
+ (re.compile(r"\bhidden markov\b|\bhmm\b"), "Hidden Markov Model"),
2321
+ (re.compile(r"\bmarkov chain\b|\bmarkov model\b"), "Markov models"),
2322
+ (re.compile(r"\bkalman filter\b"), "Kalman filter"),
2323
+ (re.compile(r"\bstate[- ]?space\b"), "State-space models"),
2324
+ (re.compile(r"\bhawkes\b"), "Hawkes process"),
2325
+ (re.compile(r"\brecommender\b|\bcollaborative filtering\b|\bmatrix factorization\b"), "Recommender systems"),
2326
+ (re.compile(r"\bahp\b|\banalytic hierarchy process\b"), "AHP"),
2327
+ (re.compile(r"\btopsis\b"), "TOPSIS"),
2328
+ (re.compile(r"\bvikor\b"), "VIKOR"),
2329
+ (re.compile(r"\bpromethee\b"), "PROMETHEE"),
2330
+ (re.compile(r"\bdematel\b"), "DEMATEL"),
2331
+ (re.compile(r"\bdea\b|\bdata envelopment analysis\b"), "DEA"),
2332
+ (re.compile(r"\bsfa\b|\bstochastic frontier\b"), "SFA"),
2333
  (re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"),
2334
  (re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"),
2335
  (re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"),
2336
+ (re.compile(r"\blinear programming\b|\binteger programming\b|\bmixed integer\b"), "Mathematical optimization"),
2337
+ (re.compile(r"\bgenetic algorithm\b"), "Genetic algorithms"),
2338
+ (re.compile(r"\bsimulated annealing\b"), "Simulated annealing"),
2339
  ]
2340
 
2341
  def _canonicalize_technique(name: str) -> tuple[str, str]:
 
2348
  return display, display.lower()
2349
 
2350
# Ordered (pattern, category) table used to bucket a normalized technique name.
# The FIRST matching entry wins, so specific entries must come before generic
# ones whose alternatives they contain.  Changes relative to the previous
# ordering, each fixing an alternative that was shadowed or misrouted:
#   * long forms of LDA / LSA / NMF join "Topic Modeling" so that
#     "non-negative matrix factorization" is no longer claimed by the
#     recommender entry's "matrix factorization";
#   * "generalized method of moments" joins the econometric entry (previously
#     only its acronym matched, and that was claimed by "Clustering");
#   * survival terms and "regression discontinuity" are tested before the
#     generic "regression" entry;
#   * "graph neural network" is tested before Deep Learning's "neural network".
category_patterns = [
    (re.compile(r"\b(bert|roberta|xlm roberta|gpt|t5|transformer|fine[- ]?tuning)\b"), "Transformers"),
    (re.compile(r"\b(word2vec|glove|doc2vec|fasttext|specter|sentence[- ]?transformer|embedding|tf[- ]?idf|bm25|bag of words|bow)\b"), "Embeddings / Representation"),
    (re.compile(r"\b(topic modeling|lda|nmf|bertopic|lsa|lsi|latent dirichlet allocation|latent semantic analysis|non[- ]?negative matrix factorization)\b"), "Topic Modeling"),
    (re.compile(r"\b(k[- ]?means|hierarchical clustering|dbscan|hdbscan|gaussian mixture|gmm|clustering)\b"), "Clustering"),
    (re.compile(r"\b(pca|svd|t-sne|tsne|umap|dimensionality reduction)\b"), "Dimensionality Reduction"),
    (re.compile(r"\b(arima|sarima|var|prophet|time[- ]?series)\b"), "Time Series / Forecasting"),
    (re.compile(r"\b(panel data|panel regression|fixed effects|random effects|multilevel|hierarchical linear model|hlm|mixed effects|glm|gls|gee|gmm|generalized method of moments|2sls|3sls|instrumental variable|tobit|heckman|poisson|negative binomial|probit|logit)\b"), "Econometric / Panel Models"),
    # Must precede the generic "regression" entry: "cox regression" and
    # "regression discontinuity" both contain the word "regression".
    (re.compile(r"\b(survival|cox|hazard|kaplan[- ]?meier)\b"), "Survival / Event History"),
    (re.compile(r"\b(regression discontinuity|rdd)\b"), "Causal Inference"),
    (re.compile(r"\b(ols|linear regression|logistic regression|ridge|lasso|elastic net|regression)\b"), "Regression"),
    (re.compile(r"\b(sem|pls[- ]?sem|cb[- ]?sem|structural equation|cfa|efa)\b"), "SEM"),
    (re.compile(r"\b(latent class analysis|latent profile analysis|latent variable|mixture model)\b"), "Latent Variable Models"),
    (re.compile(r"\b(grad(ient)? boosting|xgboost|lightgbm|catboost)\b"), "Boosting / Ensembles"),
    (re.compile(r"\b(random forest|decision tree|svm|knn|naive bayes)\b"), "Classic ML"),
    # Must precede Deep Learning, whose "neural network" alternative would
    # otherwise claim "graph neural network".
    (re.compile(r"\b(graph neural network|gnn)\b"), "Network Analysis"),
    (re.compile(r"\b(neural network|deep learning|lstm|cnn|rnn|gru|mlp|autoencoder)\b"), "Deep Learning"),
    (re.compile(r"\b(ner|named entity recognition|sentiment|nlp|text mining|tokenization|stemming|lemmatization|keyword extraction)\b"), "NLP / Text Mining"),
    (re.compile(r"\b(network|centrality|community detection|louvain|leiden|ergm|link prediction|pagerank)\b"), "Network Analysis"),
    (re.compile(r"\b(agent[- ]?based|monte carlo|bayesian optimization|linear programming|integer programming|genetic algorithm|simulated annealing)\b"), "Simulation / Optimization"),
    (re.compile(r"\b(bayesian|mcmc|gibbs|variational)\b"), "Bayesian Methods"),
    (re.compile(r"\b(anova|manova|ancova|mancova|t[- ]?test|chi[- ]?square|factor analysis|glmm|irt|mediation|moderation|wilcoxon|kruskal[- ]?wallis)\b"), "Statistical Tests / Models"),
    # NOTE(review): "instrumental variable" is claimed by the econometric entry
    # above, so only the bare acronym "iv" reaches this one - confirm intent.
    (re.compile(r"\b(difference[- ]?in[- ]?differences|did|instrumental variable|iv|propensity score|matching)\b"), "Causal Inference"),
    (re.compile(r"\b(recommender|collaborative filtering|matrix factorization)\b"), "Recommender Systems"),
    (re.compile(r"\b(hidden markov|hmm|markov|kalman|state[- ]?space|hawkes)\b"), "Sequence / Stochastic Processes"),
    (re.compile(r"\b(ahp|analytic hierarchy process|topsis|vikor|promethee|dematel)\b"), "Decision Analysis / MCDA"),
    (re.compile(r"\b(dea|data envelopment analysis|stochastic frontier|sfa|frontier analysis)\b"), "Efficiency / Frontier Analysis"),
]
2376
 
2377
# Loose keyword fallbacks, consulted only when no entry in ``category_patterns``
# matches any supplied name.  Every alternative is anchored at the start of a
# word so that short stems no longer fire in the middle of unrelated words
# ("stem" in "system", "edge" in "knowledge", "sem" in "ensemble", "supervised"
# in "unsupervised").  Acronyms additionally carry a trailing ``\b`` so they
# only match as whole words ("var" no longer matches "variance").
# First match wins, in the order listed.
_fallback_category_patterns = [
    (re.compile(r"\b(?:classifier|classification|predictive model|prediction|supervised)"), "Classic ML"),
    (re.compile(r"\bcluster"), "Clustering"),
    (re.compile(r"\b(?:topic|semantic)"), "Topic Modeling"),
    (re.compile(r"\b(?:embedding|vector|tf[- ]?idf|bow\b|bag of words)"), "Embeddings / Representation"),
    (re.compile(r"\b(?:regression|logit|probit|panel|fixed effects|random effects|glm\b|gls\b|gee\b|gmm\b)"), "Regression"),
    (re.compile(r"\b(?:sem\b|structural equation|factor|latent)"), "SEM"),
    (re.compile(r"\b(?:bayesian|mcmc\b|gibbs|priors?\b|posterior)"), "Bayesian Methods"),
    (re.compile(r"\b(?:time[- ]?series|forecast|arima|sarima|var\b|prophet)"), "Time Series / Forecasting"),
    (re.compile(r"\b(?:nlp\b|text|token|lemma|stem|language)"), "NLP / Text Mining"),
    (re.compile(r"\b(?:network|graph|node|edge)"), "Network Analysis"),
    (re.compile(r"\b(?:simulation|optimi|heuristic|metaheuristic|monte carlo|agent[- ]?based)"), "Simulation / Optimization"),
]


def _categorize_technique(*names: str) -> str:
    """Return the display category for a technique known under one or more names.

    Each non-empty name is normalized with ``_normalize_technique_key`` and the
    resulting keys are classified in three passes of decreasing precision:

    1. the ordered regexes in ``category_patterns`` (first match wins, names
       tried in the order given);
    2. the word-anchored keyword table ``_fallback_category_patterns``;
    3. a last-resort substring check for generic words such as "model" or
       "analysis", which yields "Statistical Tests / Models".

    Args:
        *names: Candidate names for the same technique, e.g. the raw extracted
            string followed by its canonical display name.  Empty or ``None``
            entries are ignored.

    Returns:
        The category label, or ``"Other"`` when nothing matches (including
        when no usable name was supplied).
    """
    # NOTE(review): the diff view this was recovered from does not preserve
    # indentation, so whether the original ran the keyword fallback per name
    # or once after the loop cannot be confirmed.  This version makes the
    # precedence explicit: precise patterns over ALL names first, then the
    # looser passes.  It also never reads an unbound ``key`` when every name
    # is empty.
    keys = [_normalize_technique_key(name) for name in names if name]

    # Pass 1: curated regex table.
    for key in keys:
        for pattern, category in category_patterns:
            if pattern.search(key):
                return category

    # Pass 2: word-anchored keyword fallbacks.
    for key in keys:
        for pattern, category in _fallback_category_patterns:
            if pattern.search(key):
                return category

    # Pass 3: deliberately loose substring check; anything that still looks
    # like a statistical procedure is grouped rather than dropped into "Other".
    generic_tokens = ("model", "analysis", "estimation", "test")
    if any(token in key for key in keys for token in generic_tokens):
        return "Statistical Tests / Models"
    return "Other"
2404
 
2405
  category_map: dict[str, dict[str, object]] = {}
 
2420
  algorithm, _ = _canonicalize_technique(technique)
2421
  if not algorithm:
2422
  continue
2423
+ category = _categorize_technique(technique, algorithm)
2424
  key = category.lower()
2425
  if key not in category_map:
2426
  category_map[key] = {
 
2470
  generate_comparison_csv,
2471
  export_narrative,
2472
  extract_methods_from_pdfs,
2473
+ ]