S-Dreamer committed on
Commit
8517cb1
·
verified ·
1 Parent(s): e4c1288

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -38
app.py CHANGED
@@ -1,50 +1,73 @@
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  from datasets import load_dataset
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
- import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # Load datasets
8
- nsfw_datasets = [
9
- load_dataset("aifeifei798/DPO_Pairs-Roleplay-NSFW"),
10
- load_dataset("Maxx0/sexting-nsfw-adultconten"),
11
- load_dataset("QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k"),
12
- load_dataset("HuggingFaceTB/everyday-conversations-llama3.1-2k"),
13
- load_dataset("Chadgpt-fam/sexting_dataset")
14
- ]
15
-
16
- # Prepare all texts from datasets
17
- all_texts = []
18
- for dataset in nsfw_datasets:
19
- for split in dataset.keys():
20
- if 'text' in dataset[split].features:
21
- all_texts.extend(dataset[split]['text'])
22
- elif 'content' in dataset[split].features:
23
- all_texts.extend(dataset[split]['content'])
24
-
25
- # Create TF-IDF vectorizer
26
- vectorizer = TfidfVectorizer()
27
- tfidf_matrix = vectorizer.fit_transform(all_texts)
28
-
29
- def find_best_description(input_text):
30
- input_vector = vectorizer.transform([input_text])
31
- similarities = cosine_similarity(input_vector, tfidf_matrix)
32
- most_similar_index = np.argmax(similarities)
33
- return all_texts[most_similar_index]
34
-
35
- def generate_text(input_text):
36
- return find_best_description(input_text)
37
-
38
- # Create Gradio interface
39
  iface = gr.Interface(
40
  fn=generate_text,
41
  inputs=gr.Textbox(label="Enter text to describe"),
42
- outputs="text",
43
  title="NSFW Text Descriptor",
44
- description="Enter text to find the best description from NSFW datasets.",
45
- allow_flagging="never"
46
  )
47
 
48
- # Launch the app
49
  if __name__ == "__main__":
50
- iface.launch()
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ NSFW Text Descriptor using TF-IDF and Cosine Similarity
5
+ Optimized for modularity, memory efficiency, and Gradio integration.
6
+ """
7
+
8
  import gradio as gr
9
+ import numpy as np
10
  from datasets import load_dataset
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
+ from itertools import chain
14
+ from typing import List
15
+
16
+
17
class NSFWTextMatcher:
    """Match free-form input against dataset texts using TF-IDF similarity.

    On construction, each dataset named in ``dataset_sources`` is loaded,
    the values of its ``text`` (or else ``content``) column are collected
    from every split, and a TF-IDF matrix is fitted over the collected
    texts. ``find_best_match`` then returns the collected text with the
    highest cosine similarity to a query.

    If no usable text could be collected, the instance is still created,
    but ``tfidf_matrix`` is ``None`` and every query returns
    ``NO_MATCH_MESSAGE``.
    """

    # Returned whenever no corpus text can be offered as a match.
    NO_MATCH_MESSAGE = "No matching text found."

    def __init__(self):
        self.dataset_sources = [
            "aifeifei798/DPO_Pairs-Roleplay-NSFW",
            "Maxx0/sexting-nsfw-adultconten",
            "QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k",
            "HuggingFaceTB/everyday-conversations-llama3.1-2k",
            "Chadgpt-fam/sexting_dataset",
        ]
        self.all_texts = self._load_all_texts()
        self.vectorizer = TfidfVectorizer()
        # None means "no fitted corpus"; find_best_match checks for it.
        self.tfidf_matrix = None
        if self.all_texts:
            try:
                self.tfidf_matrix = self.vectorizer.fit_transform(self.all_texts)
            except ValueError as e:
                # Raised by the vectorizer when the corpus yields no
                # vocabulary at all (e.g. only stop words or punctuation).
                print(f"[WARN] Could not build TF-IDF matrix: {e}")
        else:
            print("[WARN] No texts were loaded; matching is disabled.")

    @staticmethod
    def _clean_texts(values) -> List[str]:
        """Return only the non-blank ``str`` entries of *values*, in order."""
        return [v for v in values if isinstance(v, str) and v.strip()]

    def _load_all_texts(self) -> List[str]:
        """Collect the text column of every split of every source dataset.

        A dataset that fails to load is reported with a warning and skipped,
        so one bad source does not prevent the others from being used.

        Returns:
            All non-blank strings found in the ``text`` column, or in the
            ``content`` column when ``text`` is absent.
        """
        texts = []
        for source in self.dataset_sources:
            try:
                dataset = load_dataset(source)
                for split in dataset:
                    features = dataset[split].features
                    if 'text' in features:
                        texts.extend(self._clean_texts(dataset[split]['text']))
                    elif 'content' in features:
                        texts.extend(self._clean_texts(dataset[split]['content']))
                    else:
                        print(
                            f"[WARN] No 'text' or 'content' column in "
                            f"{source} ({split}); split skipped."
                        )
            except Exception as e:
                # Best-effort loading: keep going with the remaining sources.
                print(f"[WARN] Failed to load dataset {source}: {e}")
        return texts

    def find_best_match(self, input_text: str) -> str:
        """Return the collected text most similar to *input_text*.

        Returns ``NO_MATCH_MESSAGE`` when there is no fitted corpus or when
        the query has zero similarity to every text (in which case argmax
        would otherwise pick the first text arbitrarily).
        """
        if self.tfidf_matrix is None or not self.all_texts:
            return self.NO_MATCH_MESSAGE
        input_vector = self.vectorizer.transform([input_text])
        # cosine_similarity returns a (1, n_texts) array; flatten to 1-D.
        similarity_scores = cosine_similarity(input_vector, self.tfidf_matrix).ravel()
        best_match_idx = int(np.argmax(similarity_scores))
        if similarity_scores[best_match_idx] <= 0.0:
            return self.NO_MATCH_MESSAGE
        return self.all_texts[best_match_idx]
50
+
51
+
52
+ # Instantiate the matcher once (can be made lazy if needed)
53
+ matcher = NSFWTextMatcher()
54
+
55
+
56
def generate_text(input_text: str) -> str:
    """Return the best corpus match for *input_text*, for use as the UI callback.

    Args:
        input_text: Text entered by the user. ``None``, non-string values and
            blank strings are treated as invalid input rather than raising.

    Returns:
        A prompt asking for valid input when *input_text* is unusable;
        otherwise whatever ``matcher.find_best_match`` returns.
    """
    # Guard the type first: calling .strip() on None would raise AttributeError.
    if not isinstance(input_text, str) or not input_text.strip():
        return "Please enter a valid input."
    return matcher.find_best_match(input_text)
60
+
61
 
62
+ # Gradio Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  iface = gr.Interface(
64
  fn=generate_text,
65
  inputs=gr.Textbox(label="Enter text to describe"),
66
+ outputs=gr.Textbox(label="Best Match"),
67
  title="NSFW Text Descriptor",
68
+ description="Match your input with the most similar description from NSFW datasets using TF-IDF.",
69
+ allow_flagging="never",
70
  )
71
 
 
72
  if __name__ == "__main__":
73
+ iface.launch()