"""Gradio app that flags AI-generated text via SynthID watermark detection."""

import html

import gradio as gr
import torch
import transformers
import spaces
from synthid_text import synthid_mixin, logits_processing
from synthid_text.detector_mean import mean_score

# Configuration and model selection.
MODEL_NAME = "google/gemma-7b-it"
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
TOP_K = 40
TOP_P = 0.99  # NOTE(review): currently unused anywhere in this file — confirm intent
TEMPERATURE = 0.5

# Initialize tokenizer. The generation model itself is intentionally disabled:
# watermark *detection* below only needs the tokenizer + logits processor.
# model = synthid_mixin.SynthIDGemmaForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map=DEVICE,
#     torch_dtype=torch.bfloat16,
# )
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Watermarking configuration (must match the config used at generation time).
CONFIG = synthid_mixin.DEFAULT_WATERMARKING_CONFIG

# Per-token mean g-value above which a token is rendered as highlighted.
G_VALUE_THRESHOLD = 0.55
# Overall mean-score above which the whole text is flagged as AI-generated.
SCORE_THRESHOLD = 0.5


@spaces.GPU
def check_plagiarism(text):
    """Score ``text`` for a SynthID watermark.

    Returns an HTML string: the input text with tokens whose g-values exceed
    ``G_VALUE_THRESHOLD`` wrapped in ``<mark>``, prefixed by an overall
    AI-generated / human-generated verdict.
    """
    # Logits processor for SynthID. Bug fix: reuse the module-level sampling
    # constants instead of re-hardcoding 40 / 0.5 inline.
    logits_processor = logits_processing.SynthIDLogitsProcessor(
        **CONFIG, top_k=TOP_K, temperature=TEMPERATURE
    )

    # Tokenize and move the input text to the compute device.
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    input_ids = inputs["input_ids"]

    # Masks for watermark detection: trim positions before the first complete
    # n-gram, and mask out repeated contexts.
    eos_token_mask = logits_processor.compute_eos_token_mask(
        input_ids=input_ids,
        eos_token_id=tokenizer.eos_token_id,
    )[:, CONFIG["ngram_len"] - 1 :]
    context_repetition_mask = logits_processor.compute_context_repetition_mask(
        input_ids=input_ids
    )
    combined_mask = context_repetition_mask * eos_token_mask

    # G values for the input and the overall watermark score.
    g_values = logits_processor.compute_g_values(input_ids=input_ids)
    score = mean_score(g_values.cpu().numpy(), combined_mask.cpu().numpy())

    # Build the highlighted HTML.
    # Bug fix: the original appended identical text in both branches, so
    # watermarked tokens were never visually distinguished; they are now
    # wrapped in <mark>. Token text is HTML-escaped because the result is
    # rendered by a gr.HTML component.
    # NOTE(review): input_ids[0] is longer than g_values[0] (which starts at
    # the first full n-gram), so this zip truncates and token/g-value pairs
    # may be offset — confirm intended alignment against synthid_text docs.
    highlighted_parts = []
    for token_id, g_val, mask in zip(input_ids[0], g_values[0], combined_mask[0]):
        token_text = html.escape(tokenizer.decode(token_id.unsqueeze(0)))
        if mask.item() and g_val.float().mean().item() > G_VALUE_THRESHOLD:
            highlighted_parts.append(f"<mark>{token_text}</mark>")
        else:
            highlighted_parts.append(token_text)
    highlighted_text = "".join(highlighted_parts)

    # Overall verdict based on the mean score.
    if score > SCORE_THRESHOLD:
        return (
            "Flagged as AI-generated content (Academic Integrity Warning): "
            f"{highlighted_text}"
        )
    return f"Content appears to be human-generated. {highlighted_text}"


def create_plagiarism_checker():
    """Build and return the Gradio Blocks UI for the plagiarism checker."""
    with gr.Blocks() as app:
        # Placeholder for custom CSS styling.
        gr.HTML(""" """)

        # Title and description.
        gr.Markdown("""
# 📝 Plagiarism and Academic Integrity Checker
Use this tool to detect AI-generated content in your text using SynthID technology.
Paste your text below to check if it contains AI-generated segments.
---
""")

        # Layout the components.
        with gr.Row():
            # Input textbox for users to paste text.
            text_input = gr.Textbox(
                placeholder="Paste your text here...",
                label="Input Text",
                lines=10,
                max_lines=20,
                elem_id="text_input",
            )

        # Divider for clarity.
        gr.Markdown("---")

        # Output box displaying the verdict with highlighted watermark tokens.
        output = gr.HTML(label="Integrity Check Result", elem_id="output")

        # Button that triggers the check.
        check_button = gr.Button("🔍 Check Text", elem_id="check_button")
        check_button.click(fn=check_plagiarism, inputs=text_input, outputs=output)

    return app


# Launch the app (top-level so hosting platforms that exec this file start it).
plagiarism_checker_app = create_plagiarism_checker()
plagiarism_checker_app.launch()