Spaces: Runtime error
Commit: "Update app.py" (Browse files)
File: app.py — CHANGED
@@ -159,27 +159,27 @@ class DebiasingSFTTrainer(SFTTrainer):
 159
 160  @spaces.GPU()
 161  def _create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128):
 162      def gen():
 163          if method == 'Exacta':
-164              seen_texts = set()
 165              for example in dataset:
 166                  text = example.get(text_col, "")
 167                  if text and isinstance(text, str):
-168                      if text not in seen_texts:
-169                          seen_texts.add(text)
 170                          yield example
 171                  else:
 172                      yield example
 173          elif method == 'Semántica (MinHash)':
-174              lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
 175              for i, example in enumerate(dataset):
 176                  text = example.get(text_col, "")
 177                  if text and isinstance(text, str) and text.strip():
 178                      m = MinHash(num_perm=num_perm)
 179                      for d in text.split():
 180                          m.update(d.encode('utf8'))
-181                      if not lsh.query(m):
-182                          lsh.insert(f"key_{i}", m)
 183                          yield example
 184                  else:
 185                      yield example

(NOTE: the removed lines 168–169 and 181–182 were truncated in the page capture; their tails — `seen_texts:` / `seen_texts.add(text)` and `lsh.query(m):` / `lsh.insert(f"key_{i}", m)` — are reconstructed from the corresponding `+` lines of the new side, which rename these locals to `seen_texts_state` / `lsh_state`. Confirm against the repository history.)
@@ -1424,7 +1424,8 @@ def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, s
 1424      login(token=hf_token)
 1425      user = whoami()
 1426      username = user.get("name")
-1427      [removed line — its content was lost in the page extraction; it presumably computed `repo_id`, since line 1428 uses it]
 1428      create_repo(repo_id, repo_type="dataset", exist_ok=True)
 1429      all_data = []
 1430      if creation_type == "Sintético":
 159
 160  @spaces.GPU()
 161  def _create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128):
+162      lsh_state = MinHashLSH(threshold=threshold, num_perm=num_perm) if method == 'Semántica (MinHash)' else None
+163      seen_texts_state = set() if method == 'Exacta' else None
 164      def gen():
 165          if method == 'Exacta':
 166              for example in dataset:
 167                  text = example.get(text_col, "")
 168                  if text and isinstance(text, str):
+169                      if text not in seen_texts_state:
+170                          seen_texts_state.add(text)
 171                          yield example
 172                  else:
 173                      yield example
 174          elif method == 'Semántica (MinHash)':
 175              for i, example in enumerate(dataset):
 176                  text = example.get(text_col, "")
 177                  if text and isinstance(text, str) and text.strip():
 178                      m = MinHash(num_perm=num_perm)
 179                      for d in text.split():
 180                          m.update(d.encode('utf8'))
+181                      if not lsh_state.query(m):
+182                          lsh_state.insert(f"key_{i}", m)
 183                          yield example
 184                  else:
 185                      yield example

(NOTE: the change hoists the dedup state — the MinHash LSH index and the seen-text set — out of `gen()` into the enclosing function scope, so the state is created once per call to `_create_deduplicated_iterable_dataset` rather than on each invocation of `gen()`. The rest of the function, including how `gen` is consumed and returned, lies beyond this hunk and is not visible here.)
 1424      login(token=hf_token)
 1425      user = whoami()
 1426      username = user.get("name")
+1427      repo_base = f"{username}-{uuid.uuid4().hex[:6]}" if not repo_name else re.sub(r'[^a-zA-Z0-9_.-]+', '-', repo_name)[:90]
+1428      repo_id = f"{username}/{repo_base}"
 1429      create_repo(repo_id, repo_type="dataset", exist_ok=True)
 1430      all_data = []
 1431      if creation_type == "Sintético":

(NOTE(review): the new code sanitizes a user-supplied repo name (replacing runs of characters outside `[a-zA-Z0-9_.-]` with `-` and capping at 90 chars) or, when no name is given, generates one from the username plus a 6-hex-digit UUID suffix — in that fallback case the resulting `repo_id` is `username/username-xxxxxx`, which looks intentional but is worth confirming.)