Spaces:
Sleeping
Sleeping
Commit ·
6f953dc
1
Parent(s): a89f9ba
Remove tracked venv and refine gitignore
Browse files- .DS_Store +0 -0
- .gitignore +22 -0
- .vscode/settings.json +3 -0
- app.py +0 -0
- artifacts/.DS_Store +0 -0
- requirements.txt +42 -0
- src/components/clustering_engine.py +0 -0
- src/components/data_cleaning.py +0 -0
- src/components/data_loader.py +0 -0
- src/components/embedding_generator.py +0 -0
- src/components/query_engine.py +0 -0
- src/components/summarization_engine.py +0 -0
- src/components/visualization.py +0 -0
- src/config/cluster_config.json +0 -0
- src/config/config.yaml +0 -0
- src/config/model_config.json +0 -0
- src/pipelines/build_embeddings_pipeline.py +0 -0
- src/pipelines/clustering_pipeline.py +0 -0
- src/pipelines/full_run_pipeline.py +0 -0
- src/pipelines/query_pipeline.py +0 -0
- src/pipelines/summarization_pipeline.py +0 -0
- src/utils/__pycache__/exception.cpython-39.pyc +0 -0
- src/utils/__pycache__/file_utils.cpython-39.pyc +0 -0
- src/utils/__pycache__/logger.cpython-39.pyc +0 -0
- src/utils/__pycache__/plot_utils.cpython-39.pyc +0 -0
- src/utils/__pycache__/text_utils.cpython-39.pyc +0 -0
- src/utils/exception.py +3 -0
- src/utils/file_utils.py +22 -0
- src/utils/logger.py +12 -0
- src/utils/model_utils.py +45 -0
- src/utils/plot_utils.py +12 -0
- src/utils/text_utils.py +11 -0
- static/styles.css +0 -0
- temp.jsonl +1 -0
- templates/index.html +0 -0
- templates/results.html +0 -0
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
venv/
|
| 3 |
+
.env/
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
.ipynb_checkpoints/
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
artifacts/*
|
| 14 |
+
!artifacts/raw_data/
|
| 15 |
+
!artifacts/cleaned_data/
|
| 16 |
+
!artifacts/embeddings/
|
| 17 |
+
!artifacts/clustering/
|
| 18 |
+
!artifacts/summaries/
|
| 19 |
+
!artifacts/models/
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
.DS_Store
|
.vscode/settings.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"git.ignoreLimitWarning": true
|
| 3 |
+
}
|
app.py
ADDED
|
File without changes
|
artifacts/.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
numpy>=1.23
|
| 3 |
+
pandas>=1.5
|
| 4 |
+
tqdm
|
| 5 |
+
scikit-learn>=1.1
|
| 6 |
+
scipy
|
| 7 |
+
pyyaml
|
| 8 |
+
pydantic
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
transformers>=4.30
|
| 12 |
+
sentence-transformers>=2.2
|
| 13 |
+
torch>=1.13
|
| 14 |
+
accelerate
|
| 15 |
+
tokenizers
|
| 16 |
+
safetensors
|
| 17 |
+
|
| 18 |
+
hdbscan
|
| 19 |
+
faiss-cpu
|
| 20 |
+
umap-learn
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
streamlit
|
| 24 |
+
flask
|
| 25 |
+
gunicorn
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
python-dotenv
|
| 29 |
+
nltk
|
| 30 |
+
regex
|
| 31 |
+
joblib
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
matplotlib
|
| 35 |
+
wordcloud
|
| 36 |
+
plotly
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
kaggle
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
typing_extensions
|
src/components/clustering_engine.py
ADDED
|
File without changes
|
src/components/data_cleaning.py
ADDED
|
File without changes
|
src/components/data_loader.py
ADDED
|
File without changes
|
src/components/embedding_generator.py
ADDED
|
File without changes
|
src/components/query_engine.py
ADDED
|
File without changes
|
src/components/summarization_engine.py
ADDED
|
File without changes
|
src/components/visualization.py
ADDED
|
File without changes
|
src/config/cluster_config.json
ADDED
|
File without changes
|
src/config/config.yaml
ADDED
|
File without changes
|
src/config/model_config.json
ADDED
|
File without changes
|
src/pipelines/build_embeddings_pipeline.py
ADDED
|
File without changes
|
src/pipelines/clustering_pipeline.py
ADDED
|
File without changes
|
src/pipelines/full_run_pipeline.py
ADDED
|
File without changes
|
src/pipelines/query_pipeline.py
ADDED
|
File without changes
|
src/pipelines/summarization_pipeline.py
ADDED
|
File without changes
|
src/utils/__pycache__/exception.cpython-39.pyc
ADDED
|
Binary file (510 Bytes). View file
|
|
|
src/utils/__pycache__/file_utils.cpython-39.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
src/utils/__pycache__/logger.cpython-39.pyc
ADDED
|
Binary file (581 Bytes). View file
|
|
|
src/utils/__pycache__/plot_utils.cpython-39.pyc
ADDED
|
Binary file (680 Bytes). View file
|
|
|
src/utils/__pycache__/text_utils.cpython-39.pyc
ADDED
|
Binary file (590 Bytes). View file
|
|
|
src/utils/exception.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class CustomException(Exception):
    """Base exception for this project.

    Exists so callers can catch application-level failures distinctly from
    built-in exceptions (``except CustomException``).

    The original defined ``__init__(self, message)`` that only forwarded the
    message to ``super().__init__`` — exactly what ``Exception`` does by
    default — so the redundant override is removed.  This is backward
    compatible: ``CustomException(msg)`` still stores and renders *msg*
    identically, and zero/multiple arguments now also work.
    """
|
src/utils/file_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import csv
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
def save_json_lines(records, path):
    """Write *records* to *path* in JSON-Lines format (one object per line).

    Creates missing parent directories first.

    Args:
        records: iterable of JSON-serialisable objects.
        path: destination file path (str or Path).

    Fix: pass ``ensure_ascii=False`` so non-ASCII text is written as real
    UTF-8 characters (the file is already opened with ``encoding='utf-8'``)
    instead of ``\\uXXXX`` escape sequences.
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    with open(p, 'w', encoding='utf-8') as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
| 12 |
+
|
| 13 |
+
def load_json_lines(path):
    """Lazily yield one decoded JSON object per non-blank line of *path*.

    Blank/whitespace-only lines are skipped.  The file is read as UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                continue
            yield json.loads(record)
|
| 18 |
+
|
| 19 |
+
def save_csv(df, path):
    """Write DataFrame *df* to *path* as CSV without the index column,
    creating any missing parent directories first."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)
|
src/utils/logger.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
def get_logger(name=__name__, level=logging.INFO):
    """Return a logger that writes to stdout, configuring it only once.

    Repeated calls with the same *name* return the same logger without
    attaching duplicate handlers.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured by an earlier call — hand it back unchanged.
        return logger
    stream = logging.StreamHandler(sys.stdout)
    stream.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(stream)
    logger.setLevel(level)
    return logger
|
src/utils/model_utils.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import faiss
|
| 5 |
+
import torch
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 8 |
+
|
| 9 |
+
def load_sentence_transformer(path_or_name):
    """Load a SentenceTransformer from a local directory or a hub model name.

    Args:
        path_or_name: filesystem path to a saved model, or a model
            identifier resolvable by ``sentence_transformers``.

    Returns:
        The loaded ``SentenceTransformer`` instance.

    Fix: the original branched on ``os.path.exists(path_or_name)`` but both
    branches were identical calls — ``SentenceTransformer`` itself resolves
    local paths and hub names the same way, so the dead conditional is
    removed.
    """
    return SentenceTransformer(path_or_name)
|
| 15 |
+
|
| 16 |
+
def load_summarizer_model(path_or_name):
    """Load a seq2seq summarisation model and its tokenizer, placing the
    model on GPU when CUDA is available and on CPU otherwise.

    Args:
        path_or_name: local path or hub identifier for the model.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(path_or_name)
    net = AutoModelForSeq2SeqLM.from_pretrained(path_or_name)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    net.to(torch.device(target))
    return net, tok
|
| 23 |
+
|
| 24 |
+
def load_tokenizer(path_or_name):
    """Load a Hugging Face tokenizer from a local path or hub model name."""
    tokenizer = AutoTokenizer.from_pretrained(path_or_name)
    return tokenizer
|
| 26 |
+
|
| 27 |
+
def load_faiss_index(path):
    """Read a FAISS index from *path* plus any sidecar metadata beside it.

    Metadata lookup order in the index's directory:
      1. ``meta.jsonl`` — one JSON object per non-blank line;
      2. ``meta.npy``   — a pickled object array (loaded via numpy).

    Returns:
        ``(index, meta)`` where *meta* is a list, empty when no sidecar
        file exists.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"FAISS index not found: {path}")
    index = faiss.read_index(path)
    base_dir = os.path.dirname(path)
    jsonl_path = os.path.join(base_dir, "meta.jsonl")
    meta = []
    if os.path.exists(jsonl_path):
        import json
        with open(jsonl_path) as handle:
            meta = [json.loads(row) for row in handle if row.strip()]
    else:
        npy_path = os.path.join(base_dir, "meta.npy")
        if os.path.exists(npy_path):
            # allow_pickle is required for object arrays; only use with
            # trusted, locally-produced artifacts.
            meta = np.load(npy_path, allow_pickle=True).tolist()
    return index, meta
|
src/utils/plot_utils.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
from wordcloud import WordCloud
|
| 3 |
+
from collections import Counter
|
| 4 |
+
|
| 5 |
+
def plot_wordcloud(texts, max_words=100):
    """Render a word cloud from *texts* (an iterable of strings).

    Returns the ``matplotlib.pyplot`` module with the figure prepared, so
    the caller decides whether to ``show()`` or ``savefig()``.
    """
    corpus = " ".join(texts)
    cloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        max_words=max_words,
    )
    cloud.generate(corpus)
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    return plt
|
src/utils/text_utils.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from nltk.stem import PorterStemmer
|
| 2 |
+
from nltk.tokenize import word_tokenize
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
# Module-level stemmer shared by stem_text (avoids re-instantiating per call).
ps = PorterStemmer()

def stem_text(text):
    """Return *text* with every NLTK word token reduced to its Porter stem,
    re-joined by single spaces.  Falsy input (None, "") yields ""."""
    if not text:
        return ""
    stems = [ps.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)
|
static/styles.css
ADDED
|
File without changes
|
temp.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"a": 1}
|
templates/index.html
ADDED
|
File without changes
|
templates/results.html
ADDED
|
File without changes
|