Ignaciohhhhggfgjfrffd committed on
Commit
cf8cb14
·
verified ·
1 Parent(s): a4ff010

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -7
app.py CHANGED
@@ -159,27 +159,27 @@ class DebiasingSFTTrainer(SFTTrainer):
159
 
160
  @spaces.GPU()
161
  def _create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128):
 
 
162
  def gen():
163
  if method == 'Exacta':
164
- seen_texts = set()
165
  for example in dataset:
166
  text = example.get(text_col, "")
167
  if text and isinstance(text, str):
168
- if text not in seen_texts:
169
- seen_texts.add(text)
170
  yield example
171
  else:
172
  yield example
173
  elif method == 'Semántica (MinHash)':
174
- lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
175
  for i, example in enumerate(dataset):
176
  text = example.get(text_col, "")
177
  if text and isinstance(text, str) and text.strip():
178
  m = MinHash(num_perm=num_perm)
179
  for d in text.split():
180
  m.update(d.encode('utf8'))
181
- if not lsh.query(m):
182
- lsh.insert(f"key_{i}", m)
183
  yield example
184
  else:
185
  yield example
@@ -1424,7 +1424,8 @@ def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, s
1424
  login(token=hf_token)
1425
  user = whoami()
1426
  username = user.get("name")
1427
- repo_id = f"{username}/{repo_name}"
 
1428
  create_repo(repo_id, repo_type="dataset", exist_ok=True)
1429
  all_data = []
1430
  if creation_type == "Sintético":
 
159
 
160
  @spaces.GPU()
161
  def _create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128):
162
+ lsh_state = MinHashLSH(threshold=threshold, num_perm=num_perm) if method == 'Semántica (MinHash)' else None
163
+ seen_texts_state = set() if method == 'Exacta' else None
164
  def gen():
165
  if method == 'Exacta':
 
166
  for example in dataset:
167
  text = example.get(text_col, "")
168
  if text and isinstance(text, str):
169
+ if text not in seen_texts_state:
170
+ seen_texts_state.add(text)
171
  yield example
172
  else:
173
  yield example
174
  elif method == 'Semántica (MinHash)':
 
175
  for i, example in enumerate(dataset):
176
  text = example.get(text_col, "")
177
  if text and isinstance(text, str) and text.strip():
178
  m = MinHash(num_perm=num_perm)
179
  for d in text.split():
180
  m.update(d.encode('utf8'))
181
+ if not lsh_state.query(m):
182
+ lsh_state.insert(f"key_{i}", m)
183
  yield example
184
  else:
185
  yield example
 
1424
  login(token=hf_token)
1425
  user = whoami()
1426
  username = user.get("name")
1427
+ repo_base = f"{username}-{uuid.uuid4().hex[:6]}" if not repo_name else re.sub(r'[^a-zA-Z0-9_.-]+', '-', repo_name)[:90]
1428
+ repo_id = f"{username}/{repo_base}"
1429
  create_repo(repo_id, repo_type="dataset", exist_ok=True)
1430
  all_data = []
1431
  if creation_type == "Sintético":