Spaces:
Sleeping
Sleeping
Commit ·
6f953dc
1
Parent(s): a89f9ba
Remove tracked venv and refine gitignore
Browse files- .DS_Store +0 -0
- .gitignore +22 -0
- .vscode/settings.json +3 -0
- app.py +0 -0
- artifacts/.DS_Store +0 -0
- requirements.txt +42 -0
- src/components/clustering_engine.py +0 -0
- src/components/data_cleaning.py +0 -0
- src/components/data_loader.py +0 -0
- src/components/embedding_generator.py +0 -0
- src/components/query_engine.py +0 -0
- src/components/summarization_engine.py +0 -0
- src/components/visualization.py +0 -0
- src/config/cluster_config.json +0 -0
- src/config/config.yaml +0 -0
- src/config/model_config.json +0 -0
- src/pipelines/build_embeddings_pipeline.py +0 -0
- src/pipelines/clustering_pipeline.py +0 -0
- src/pipelines/full_run_pipeline.py +0 -0
- src/pipelines/query_pipeline.py +0 -0
- src/pipelines/summarization_pipeline.py +0 -0
- src/utils/__pycache__/exception.cpython-39.pyc +0 -0
- src/utils/__pycache__/file_utils.cpython-39.pyc +0 -0
- src/utils/__pycache__/logger.cpython-39.pyc +0 -0
- src/utils/__pycache__/plot_utils.cpython-39.pyc +0 -0
- src/utils/__pycache__/text_utils.cpython-39.pyc +0 -0
- src/utils/exception.py +3 -0
- src/utils/file_utils.py +22 -0
- src/utils/logger.py +12 -0
- src/utils/model_utils.py +45 -0
- src/utils/plot_utils.py +12 -0
- src/utils/text_utils.py +11 -0
- static/styles.css +0 -0
- temp.jsonl +1 -0
- templates/index.html +0 -0
- templates/results.html +0 -0
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
venv/
|
| 3 |
+
.env/
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
.ipynb_checkpoints/
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
artifacts/*
|
| 14 |
+
!artifacts/raw_data/
|
| 15 |
+
!artifacts/cleaned_data/
|
| 16 |
+
!artifacts/embeddings/
|
| 17 |
+
!artifacts/clustering/
|
| 18 |
+
!artifacts/summaries/
|
| 19 |
+
!artifacts/models/
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
.DS_Store
|
.vscode/settings.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"git.ignoreLimitWarning": true
|
| 3 |
+
}
|
app.py
ADDED
|
File without changes
|
artifacts/.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
numpy>=1.23
|
| 3 |
+
pandas>=1.5
|
| 4 |
+
tqdm
|
| 5 |
+
scikit-learn>=1.1
|
| 6 |
+
scipy
|
| 7 |
+
pyyaml
|
| 8 |
+
pydantic
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
transformers>=4.30
|
| 12 |
+
sentence-transformers>=2.2
|
| 13 |
+
torch>=1.13
|
| 14 |
+
accelerate
|
| 15 |
+
tokenizers
|
| 16 |
+
safetensors
|
| 17 |
+
|
| 18 |
+
hdbscan
|
| 19 |
+
faiss-cpu
|
| 20 |
+
umap-learn
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
streamlit
|
| 24 |
+
flask
|
| 25 |
+
gunicorn
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
python-dotenv
|
| 29 |
+
nltk
|
| 30 |
+
regex
|
| 31 |
+
joblib
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
matplotlib
|
| 35 |
+
wordcloud
|
| 36 |
+
plotly
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
kaggle
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
typing_extensions
|
src/components/clustering_engine.py
ADDED
|
File without changes
|
src/components/data_cleaning.py
ADDED
|
File without changes
|
src/components/data_loader.py
ADDED
|
File without changes
|
src/components/embedding_generator.py
ADDED
|
File without changes
|
src/components/query_engine.py
ADDED
|
File without changes
|
src/components/summarization_engine.py
ADDED
|
File without changes
|
src/components/visualization.py
ADDED
|
File without changes
|
src/config/cluster_config.json
ADDED
|
File without changes
|
src/config/config.yaml
ADDED
|
File without changes
|
src/config/model_config.json
ADDED
|
File without changes
|
src/pipelines/build_embeddings_pipeline.py
ADDED
|
File without changes
|
src/pipelines/clustering_pipeline.py
ADDED
|
File without changes
|
src/pipelines/full_run_pipeline.py
ADDED
|
File without changes
|
src/pipelines/query_pipeline.py
ADDED
|
File without changes
|
src/pipelines/summarization_pipeline.py
ADDED
|
File without changes
|
src/utils/__pycache__/exception.cpython-39.pyc
ADDED
|
Binary file (510 Bytes). View file
|
|
|
src/utils/__pycache__/file_utils.cpython-39.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
src/utils/__pycache__/logger.cpython-39.pyc
ADDED
|
Binary file (581 Bytes). View file
|
|
|
src/utils/__pycache__/plot_utils.cpython-39.pyc
ADDED
|
Binary file (680 Bytes). View file
|
|
|
src/utils/__pycache__/text_utils.cpython-39.pyc
ADDED
|
Binary file (590 Bytes). View file
|
|
|
src/utils/exception.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class CustomException(Exception):
    """Base exception for this project.

    Exists so callers can catch application-level failures distinctly from
    built-in exceptions (``except CustomException``).

    The original defined ``__init__(self, message)`` that only forwarded the
    message to ``super().__init__`` — exactly what ``Exception`` does by
    default — so the redundant override is removed.  This is backward
    compatible: ``CustomException(msg)`` still stores and renders *msg*
    identically, and zero/multiple arguments now also work.
    """
|
src/utils/file_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import csv
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
def save_json_lines(records, path):
    """Write *records* to *path* in JSON-Lines format (one object per line).

    Creates missing parent directories first.

    Args:
        records: iterable of JSON-serialisable objects.
        path: destination file path (str or Path).

    Fix: pass ``ensure_ascii=False`` so non-ASCII text is written as real
    UTF-8 characters (the file is already opened with ``encoding='utf-8'``)
    instead of ``\\uXXXX`` escape sequences.
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    with open(p, 'w', encoding='utf-8') as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
| 12 |
+
|
| 13 |
+
def load_json_lines(path):
    """Lazily yield one decoded JSON object per non-blank line of *path*.

    Blank/whitespace-only lines are skipped.  The file is read as UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                continue
            yield json.loads(record)
|
| 18 |
+
|
| 19 |
+
def save_csv(df, path):
    """Write DataFrame *df* to *path* as CSV without the index column,
    creating any missing parent directories first."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)
|
src/utils/logger.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
def get_logger(name=__name__, level=logging.INFO):
    """Return a logger that writes to stdout, configuring it only once.

    Repeated calls with the same *name* return the same logger without
    attaching duplicate handlers.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured by an earlier call — hand it back unchanged.
        return logger
    stream = logging.StreamHandler(sys.stdout)
    stream.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(stream)
    logger.setLevel(level)
    return logger
|
src/utils/model_utils.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import faiss
|
| 5 |
+
import torch
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 8 |
+
|
| 9 |
+
def load_sentence_transformer(path_or_name):
    """Load a SentenceTransformer from a local directory or a hub model name.

    Args:
        path_or_name: filesystem path to a saved model, or a model
            identifier resolvable by ``sentence_transformers``.

    Returns:
        The loaded ``SentenceTransformer`` instance.

    Fix: the original branched on ``os.path.exists(path_or_name)`` but both
    branches were identical calls — ``SentenceTransformer`` itself resolves
    local paths and hub names the same way, so the dead conditional is
    removed.
    """
    return SentenceTransformer(path_or_name)
|
| 15 |
+
|
| 16 |
+
def load_summarizer_model(path_or_name):
    """Load a seq2seq summarisation model and its tokenizer, placing the
    model on GPU when CUDA is available and on CPU otherwise.

    Args:
        path_or_name: local path or hub identifier for the model.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(path_or_name)
    net = AutoModelForSeq2SeqLM.from_pretrained(path_or_name)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    net.to(torch.device(target))
    return net, tok
|
| 23 |
+
|
| 24 |
+
def load_tokenizer(path_or_name):
    """Load a Hugging Face tokenizer from a local path or hub model name."""
    tokenizer = AutoTokenizer.from_pretrained(path_or_name)
    return tokenizer
|
| 26 |
+
|
| 27 |
+
def load_faiss_index(path):
    """Read a FAISS index from *path* plus any sidecar metadata beside it.

    Metadata lookup order in the index's directory:
      1. ``meta.jsonl`` — one JSON object per non-blank line;
      2. ``meta.npy``   — a pickled object array (loaded via numpy).

    Returns:
        ``(index, meta)`` where *meta* is a list, empty when no sidecar
        file exists.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"FAISS index not found: {path}")
    index = faiss.read_index(path)
    base_dir = os.path.dirname(path)
    jsonl_path = os.path.join(base_dir, "meta.jsonl")
    meta = []
    if os.path.exists(jsonl_path):
        import json
        with open(jsonl_path) as handle:
            meta = [json.loads(row) for row in handle if row.strip()]
    else:
        npy_path = os.path.join(base_dir, "meta.npy")
        if os.path.exists(npy_path):
            # allow_pickle is required for object arrays; only use with
            # trusted, locally-produced artifacts.
            meta = np.load(npy_path, allow_pickle=True).tolist()
    return index, meta
|
src/utils/plot_utils.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
from wordcloud import WordCloud
|
| 3 |
+
from collections import Counter
|
| 4 |
+
|
| 5 |
+
def plot_wordcloud(texts, max_words=100):
    """Render a word cloud from *texts* (an iterable of strings).

    Returns the ``matplotlib.pyplot`` module with the figure prepared, so
    the caller decides whether to ``show()`` or ``savefig()``.
    """
    corpus = " ".join(texts)
    cloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        max_words=max_words,
    )
    cloud.generate(corpus)
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    return plt
|
src/utils/text_utils.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from nltk.stem import PorterStemmer
|
| 2 |
+
from nltk.tokenize import word_tokenize
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
# Module-level stemmer shared by stem_text (avoids re-instantiating per call).
ps = PorterStemmer()

def stem_text(text):
    """Return *text* with every NLTK word token reduced to its Porter stem,
    re-joined by single spaces.  Falsy input (None, "") yields ""."""
    if not text:
        return ""
    stems = [ps.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)
|
static/styles.css
ADDED
|
File without changes
|
temp.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"a": 1}
|
templates/index.html
ADDED
|
File without changes
|
templates/results.html
ADDED
|
File without changes
|