yasserrmd committed on
Commit
af005d6
·
verified ·
1 Parent(s): a9a3049

Update app.py

Browse files
Files changed (1)
  1. app.py +24 -36
app.py CHANGED
@@ -32,55 +32,43 @@ CONFIG = synthid_mixin.DEFAULT_WATERMARKING_CONFIG
32
  def check_plagiarism(text):
33
  # Logits processor for SynthID
34
  logits_processor = logits_processing.SynthIDLogitsProcessor(
35
- **CONFIG, top_k=TOP_K, temperature=TEMPERATURE
36
  )
37
 
38
  # Tokenize and process the input text
39
- inputs = tokenizer(text, return_tensors="pt", padding=True).to(DEVICE)
40
- inputs_len = inputs['input_ids'].shape[1]
41
-
42
- # Generate output with model, capturing scores (logits)
43
- with torch.no_grad():
44
- outputs = model.generate(
45
- **inputs,
46
- do_sample=True,
47
- max_length=1024,
48
- temperature=TEMPERATURE,
49
- top_k=TOP_K,
50
- top_p=TOP_P,
51
- )
52
 
53
-
54
- # Extract the generated tokens from the model's predictions
55
- generated_tokens = outputs[:, inputs_len:]
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Compute masks for watermark detection
58
- eos_token_mask = logits_processor.compute_eos_token_mask(
59
- input_ids=generated_tokens,
60
- eos_token_id=tokenizer.eos_token_id,
61
- )[:, CONFIG['ngram_len'] - 1 :]
62
-
63
- context_repetition_mask = logits_processor.compute_context_repetition_mask(
64
- input_ids=generated_tokens
65
- )
66
-
67
- # Combine the masks
68
- combined_mask = context_repetition_mask * eos_token_mask
69
-
70
- # Compute G values for the generated text
71
- g_values = logits_processor.compute_g_values(input_ids=generated_tokens)
72
-
73
  # Score the G values with the combined mask
74
  score = mean_score(g_values.cpu().numpy(), combined_mask.cpu().numpy())
75
 
76
  # Initialize string to store highlighted output
77
  highlighted_text = ""
78
 
79
- for token_id, g_val, mask in zip(generated_tokens[0], g_values[0], combined_mask[0]):
 
80
  token_text = tokenizer.decode(token_id.unsqueeze(0))
81
 
82
- # If the token is part of the watermark, use a mean or max threshold on g_val if it's multi-element
83
- if mask.item() and g_val.float().mean().item() > 0.5: # Use .mean() to get a scalar value
84
  highlighted_text += f"<mark>{token_text}</mark>" # Highlight watermarked content
85
  else:
86
  highlighted_text += token_text
 
32
  def check_plagiarism(text):
33
  # Logits processor for SynthID
34
  logits_processor = logits_processing.SynthIDLogitsProcessor(
35
+ **CONFIG, top_k=40, temperature=0.5
36
  )
37
 
38
  # Tokenize and process the input text
39
+ inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
40
+
41
+ # Extract token IDs for the input text only
42
+ input_ids = inputs['input_ids']
 
 
 
 
 
 
 
 
 
43
 
44
+ # Compute masks for watermark detection
45
+ eos_token_mask = logits_processor.compute_eos_token_mask(
46
+ input_ids=input_ids,
47
+ eos_token_id=tokenizer.eos_token_id,
48
+ )[:, CONFIG['ngram_len'] - 1:]
49
+
50
+ context_repetition_mask = logits_processor.compute_context_repetition_mask(
51
+ input_ids=input_ids
52
+ )
53
+
54
+ # Combine the masks
55
+ combined_mask = context_repetition_mask * eos_token_mask
56
+
57
+ # Compute G values for the input text
58
+ g_values = logits_processor.compute_g_values(input_ids=input_ids)
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # Score the G values with the combined mask
61
  score = mean_score(g_values.cpu().numpy(), combined_mask.cpu().numpy())
62
 
63
  # Initialize string to store highlighted output
64
  highlighted_text = ""
65
 
66
+ # Loop through each token in the input text and apply highlighting if it meets the watermark criteria
67
+ for token_id, g_val, mask in zip(input_ids[0], g_values[0], combined_mask[0]):
68
  token_text = tokenizer.decode(token_id.unsqueeze(0))
69
 
70
+ # Convert g_val to float and highlight if it meets the threshold
71
+ if mask.item() and g_val.float().mean().item() > 0.5:
72
  highlighted_text += f"<mark>{token_text}</mark>" # Highlight watermarked content
73
  else:
74
  highlighted_text += token_text