Anshrathore01 committed on
Commit 6f953dc · 1 Parent(s): a89f9ba

Remove tracked venv and refine gitignore
.DS_Store ADDED
Binary file (8.2 kB).
.gitignore ADDED
@@ -0,0 +1,22 @@
+
+venv/
+.env/
+
+
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.ipynb_checkpoints/
+
+
+artifacts/*
+!artifacts/raw_data/
+!artifacts/cleaned_data/
+!artifacts/embeddings/
+!artifacts/clustering/
+!artifacts/summaries/
+!artifacts/models/
+
+
+.DS_Store
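
Since venv/ was already tracked, the new .gitignore entry alone would not untrack it; the commit message implies the index was also cleaned. A sketch with ordinary git commands:

    git rm -r --cached venv/
    git commit -m "Remove tracked venv and refine gitignore"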
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
+{
+    "git.ignoreLimitWarning": true
+}
app.py ADDED
File without changes
artifacts/.DS_Store ADDED
Binary file (8.2 kB).
requirements.txt ADDED
@@ -0,0 +1,42 @@
+
+numpy>=1.23
+pandas>=1.5
+tqdm
+scikit-learn>=1.1
+scipy
+pyyaml
+pydantic
+
+
+transformers>=4.30
+sentence-transformers>=2.2
+torch>=1.13
+accelerate
+tokenizers
+safetensors
+
+hdbscan
+faiss-cpu
+umap-learn
+
+
+streamlit
+flask
+gunicorn
+
+
+python-dotenv
+nltk
+regex
+joblib
+
+
+matplotlib
+wordcloud
+plotly
+
+
+kaggle
+
+
+typing_extensions
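
To reproduce the environment from this file (a minimal sketch; the repo documents no install step, so the venv name is illustrative):

    python -m venv venv
    source venv/bin/activate
    pip install -r requirements.txt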
src/components/clustering_engine.py ADDED
File without changes
src/components/data_cleaning.py ADDED
File without changes
src/components/data_loader.py ADDED
File without changes
src/components/embedding_generator.py ADDED
File without changes
src/components/query_engine.py ADDED
File without changes
src/components/summarization_engine.py ADDED
File without changes
src/components/visualization.py ADDED
File without changes
src/config/cluster_config.json ADDED
File without changes
src/config/config.yaml ADDED
File without changes
src/config/model_config.json ADDED
File without changes
src/pipelines/build_embeddings_pipeline.py ADDED
File without changes
src/pipelines/clustering_pipeline.py ADDED
File without changes
src/pipelines/full_run_pipeline.py ADDED
File without changes
src/pipelines/query_pipeline.py ADDED
File without changes
src/pipelines/summarization_pipeline.py ADDED
File without changes
src/utils/__pycache__/exception.cpython-39.pyc ADDED
Binary file (510 Bytes).
src/utils/__pycache__/file_utils.cpython-39.pyc ADDED
Binary file (1.01 kB).
src/utils/__pycache__/logger.cpython-39.pyc ADDED
Binary file (581 Bytes).
src/utils/__pycache__/plot_utils.cpython-39.pyc ADDED
Binary file (680 Bytes).
src/utils/__pycache__/text_utils.cpython-39.pyc ADDED
Binary file (590 Bytes).
src/utils/exception.py ADDED
@@ -0,0 +1,3 @@
+class CustomException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
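
A minimal usage sketch (hypothetical call site; the component modules in this commit are still empty):

    from src.utils.exception import CustomException

    try:
        raise CustomException("embeddings file missing")
    except CustomException as e:
        print(e)  # embeddings file missing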
src/utils/file_utils.py ADDED
@@ -0,0 +1,22 @@
+import json
+import csv
+from pathlib import Path
+import pandas as pd
+
+def save_json_lines(records, path):
+    p = Path(path)
+    p.parent.mkdir(parents=True, exist_ok=True)
+    with open(p, 'w', encoding='utf-8') as f:
+        for r in records:
+            f.write(json.dumps(r) + "\n")
+
+def load_json_lines(path):
+    with open(path, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                yield json.loads(line)
+
+def save_csv(df, path):
+    p = Path(path)
+    p.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(p, index=False)
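
A round-trip sketch for these helpers (paths are illustrative, not part of the commit):

    from src.utils.file_utils import save_json_lines, load_json_lines

    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]
    save_json_lines(records, "artifacts/cleaned_data/sample.jsonl")
    assert list(load_json_lines("artifacts/cleaned_data/sample.jsonl")) == records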
src/utils/logger.py ADDED
@@ -0,0 +1,12 @@
+import logging
+import sys
+
+def get_logger(name=__name__, level=logging.INFO):
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        handler = logging.StreamHandler(sys.stdout)
+        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        handler.setFormatter(logging.Formatter(fmt))
+        logger.addHandler(handler)
+    logger.setLevel(level)
+    return logger
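
Typical use (logger name is illustrative); the handlers guard means repeated get_logger calls with the same name do not stack duplicate handlers:

    from src.utils.logger import get_logger

    log = get_logger("clustering_engine")
    log.info("loaded embeddings")  # e.g. 2024-01-01 12:00:00 - clustering_engine - INFO - loaded embeddings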
src/utils/model_utils.py ADDED
@@ -0,0 +1,40 @@
+import os
+import json
+import numpy as np
+import faiss
+import torch
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+def load_sentence_transformer(path_or_name):
+    # SentenceTransformer accepts either a local directory or a hub
+    # model name, so no branching on os.path.exists is needed.
+    return SentenceTransformer(path_or_name)
+
+def load_summarizer_model(path_or_name):
+    tokenizer = AutoTokenizer.from_pretrained(path_or_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(path_or_name)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    return model, tokenizer
+
+def load_tokenizer(path_or_name):
+    return AutoTokenizer.from_pretrained(path_or_name)
+
+def load_faiss_index(path):
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"FAISS index not found: {path}")
+    index = faiss.read_index(path)
+    # Load metadata if available: prefer meta.jsonl, fall back to meta.npy.
+    meta_path = os.path.join(os.path.dirname(path), "meta.jsonl")
+    meta = []
+    if os.path.exists(meta_path):
+        with open(meta_path) as f:
+            for line in f:
+                if line.strip():
+                    meta.append(json.loads(line))
+    else:
+        np_meta = os.path.join(os.path.dirname(path), "meta.npy")
+        if os.path.exists(np_meta):
+            meta = np.load(np_meta, allow_pickle=True).tolist()
+    return index, meta
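
How these loaders compose for retrieval (a sketch under assumed artifact paths; the model name, index filename, and k=5 are illustrative):

    import numpy as np
    from src.utils.model_utils import load_sentence_transformer, load_faiss_index

    model = load_sentence_transformer("sentence-transformers/all-MiniLM-L6-v2")
    index, meta = load_faiss_index("artifacts/embeddings/index.faiss")

    query_vec = model.encode(["battery drains too fast"])            # shape (1, d)
    distances, ids = index.search(np.asarray(query_vec, "float32"), 5)
    hits = [meta[i] for i in ids[0] if i != -1]                      # map rows back to metadata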
src/utils/plot_utils.py ADDED
@@ -0,0 +1,11 @@
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+
+def plot_wordcloud(texts, max_words=100):
+    text = " ".join(texts)
+    wc = WordCloud(width=800, height=400, background_color="white", max_words=max_words)
+    wc.generate(text)
+    plt.figure(figsize=(12, 6))
+    plt.imshow(wc, interpolation='bilinear')
+    plt.axis('off')
+    return plt
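
Usage sketch (texts and output path are illustrative):

    from src.utils.plot_utils import plot_wordcloud

    plt = plot_wordcloud(["great battery", "battery drains fast", "great screen"])
    plt.savefig("artifacts/summaries/wordcloud.png", bbox_inches="tight")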
src/utils/text_utils.py ADDED
@@ -0,0 +1,10 @@
+from nltk.stem import PorterStemmer
+from nltk.tokenize import word_tokenize
+
+ps = PorterStemmer()
+
+def stem_text(text):
+    if not text:
+        return ""
+    tokens = word_tokenize(text)
+    return " ".join(ps.stem(t) for t in tokens)
static/styles.css ADDED
File without changes
temp.jsonl ADDED
@@ -0,0 +1 @@
+{"a": 1}
templates/index.html ADDED
File without changes
templates/results.html ADDED
File without changes