Train_xd

Runtime error

App Files Files Community

Ignaciohhhhggfgjfrffd committed on Nov 10

Commit

cf8cb14

verified ·

1 Parent(s): a4ff010

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -7

app.py CHANGED Viewed

@@ -159,27 +159,27 @@ class DebiasingSFTTrainer(SFTTrainer):
 @spaces.GPU()
 def _create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128):
     def gen():
         if method == 'Exacta':
-            seen_texts = set()
             for example in dataset:
                 text = example.get(text_col, "")
                 if text and isinstance(text, str):
-                    if text not in seen_texts:
-                        seen_texts.add(text)
                         yield example
                 else:
                     yield example
         elif method == 'Semántica (MinHash)':
-            lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
             for i, example in enumerate(dataset):
                 text = example.get(text_col, "")
                 if text and isinstance(text, str) and text.strip():
                     m = MinHash(num_perm=num_perm)
                     for d in text.split():
                         m.update(d.encode('utf8'))
-                    if not lsh.query(m):
-                        lsh.insert(f"key_{i}", m)
                         yield example
                 else:
                     yield example
@@ -1424,7 +1424,8 @@ def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, s
         login(token=hf_token)
         user = whoami()
         username = user.get("name")
-        repo_id = f"{username}/{repo_name}"
         create_repo(repo_id, repo_type="dataset", exist_ok=True)
         all_data = []
         if creation_type == "Sintético":

 @spaces.GPU()
 def _create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128):
+    lsh_state = MinHashLSH(threshold=threshold, num_perm=num_perm) if method == 'Semántica (MinHash)' else None
+    seen_texts_state = set() if method == 'Exacta' else None
     def gen():
         if method == 'Exacta':
             for example in dataset:
                 text = example.get(text_col, "")
                 if text and isinstance(text, str):
+                    if text not in seen_texts_state:
+                        seen_texts_state.add(text)
                         yield example
                 else:
                     yield example
         elif method == 'Semántica (MinHash)':
             for i, example in enumerate(dataset):
                 text = example.get(text_col, "")
                 if text and isinstance(text, str) and text.strip():
                     m = MinHash(num_perm=num_perm)
                     for d in text.split():
                         m.update(d.encode('utf8'))
+                    if not lsh_state.query(m):
+                        lsh_state.insert(f"key_{i}", m)
                         yield example
                 else:
                     yield example
         login(token=hf_token)
         user = whoami()
         username = user.get("name")
+        repo_base = f"{username}-{uuid.uuid4().hex[:6]}" if not repo_name else re.sub(r'[^a-zA-Z0-9_.-]+', '-', repo_name)[:90]
+        repo_id = f"{username}/{repo_base}"
         create_repo(repo_id, repo_type="dataset", exist_ok=True)
         all_data = []
         if creation_type == "Sintético":