Spaces:

dejanseo
/

ai-detection

Sleeping

App Files Files Community

dejanseo committed on Apr 14, 2025

Commit

432c5a1

verified ·

1 Parent(s): 4cc104c

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -66

app.py CHANGED Viewed

@@ -3,6 +3,11 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 # Set the page configuration
 st.set_page_config(
@@ -15,7 +20,7 @@ st.set_page_config(
 st.logo(
     image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
     link="https://dejan.ai/",
-    size="large"
 )
 # Font styling
@@ -28,88 +33,171 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
-# Load model and tokenizer
 MODEL_NAME = "dejanseo/ai-detection"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSequenceClassification.from_pretrained(
-    MODEL_NAME,
-    device_map="auto",
-    torch_dtype=torch.float32  # ensure safe fallback on CPU
-)
-model.eval()
 # Static settings
 LABELS = ["AI Content", "Human Content"]
 COLORS = ["#ffe5e5", "#e6ffe6"]  # light red, light green
-# Regex-based sentence splitter
 def sent_tokenize(text):
-    return re.split(r'(?<=[.!?]) +', text.strip())
-def split_into_chunks(text, max_length=512):
     sentences = sent_tokenize(text)
-    chunks, current_chunk, current_len = [], [], 0
     for sent in sentences:
-        token_len = len(tokenizer.tokenize(sent))
-        if current_len + token_len <= max_length - 2:
-            current_chunk.append(sent)
             current_len += token_len
         else:
-            if current_chunk:
-                chunks.append(" ".join(current_chunk))
-            current_chunk = [sent]
             current_len = token_len
-    if current_chunk:
-        chunks.append(" ".join(current_chunk))
     return chunks
-# UI
 st.title("AI Article Detection")
-text = st.text_area("Enter text to classify", height=100)
-if st.button("Classify"):
-    if not text.strip():
         st.warning("Please enter some text.")
     else:
-        with st.spinner("Analyzing..."):
-            chunks = split_into_chunks(text)
-            inputs = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
-            with torch.no_grad():
-                outputs = model(**inputs)
-                logits = outputs.logits
-                probs = F.softmax(logits, dim=-1)
-                preds = torch.argmax(probs, dim=-1)
-            chunk_results = []
-            for i, chunk in enumerate(chunks):
-                pred = int(preds[i].item())
-                chunk_results.append({
-                    "text": chunk,
-                    "label": LABELS[pred],
-                    "color": COLORS[pred],
-                    "conf": probs[i][pred].item() * 100,
-                })
-            avg_probs = torch.mean(probs, dim=0).tolist()
-            final_class = int(torch.argmax(torch.tensor(avg_probs)).item())
-            final_label = LABELS[final_class]
-            final_conf = avg_probs[final_class] * 100
-        st.subheader("📊 Final Prediction")
-        st.markdown(
-            f"<div style='background-color:{COLORS[final_class]}; padding:1rem; border-radius:0.5rem'>"
-            f"<b>{final_label}</b> ({final_conf:.1f}%)</div>",
-            unsafe_allow_html=True
-        )
-        with st.expander("See per-chunk predictions"):
-            for result in chunk_results:
-                st.markdown(
-                    f"<div title='Confidence: {result['conf']:.1f}%' "
-                    f"style='background-color:{result['color']}; padding:0.75rem; margin-bottom:0.5rem; border-radius:0.5rem'>"
-                    f"{result['text']}</div>",
-                    unsafe_allow_html=True
-                )

 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
+import logging # Optional: Add logging for better debugging
+# Set up logging (optional but helpful)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Set the page configuration
 st.set_page_config(
 st.logo(
     image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
     link="https://dejan.ai/",
+    # size="large" # 'size' is not a valid argument for st.logo as of Streamlit 1.34 - remove or adjust if needed
 )
 # Font styling
 </style>
 """, unsafe_allow_html=True)
+@st.cache_resource # Cache the model and tokenizer to avoid reloading on every interaction
+def load_model_and_tokenizer(model_name):
+    """Loads the model and tokenizer."""
+    logger.info(f"Loading tokenizer: {model_name}")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Determine device
+    device_type = "cuda" if torch.cuda.is_available() else "cpu"
+    # Use bfloat16 if available on CUDA for potential speedup/memory saving, else float32
+    dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
+    logger.info(f"Using device: {device_type} with dtype: {dtype}")
+    logger.info(f"Loading model: {model_name}")
+    # Load model onto CPU first, then move to target device
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name,
+        torch_dtype=dtype # Use the determined dtype
+        # Removed device_map="auto"
+    )
+    logger.info("Moving model to target device...")
+    model.to(torch.device(device_type)) # Move the entire model to the target device
+    model.eval() # Set model to evaluation mode
+    logger.info("Model loaded successfully.")
+    return tokenizer, model, torch.device(device_type)
+# Load model and tokenizer using the cached function
 MODEL_NAME = "dejanseo/ai-detection"
+try:
+    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
+except Exception as e:
+    st.error(f"Error loading model: {e}")
+    logger.error(f"Failed to load model or tokenizer: {e}", exc_info=True)
+    st.stop() # Stop execution if model loading fails
 # Static settings
 LABELS = ["AI Content", "Human Content"]
 COLORS = ["#ffe5e5", "#e6ffe6"]  # light red, light green
+# Regex-based sentence splitter (improved slightly for robustness)
 def sent_tokenize(text):
+    # Split on runs of whitespace that follow '.', '!' or '?'; the punctuation stays attached to its sentence
+    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+    # Filter out empty strings that might result from splitting
+    return [s for s in sentences if s]
+def split_into_chunks(text, tokenizer, max_length=512):
     sentences = sent_tokenize(text)
+    if not sentences:
+        return [] # Handle empty input after tokenization
+    chunks, current_chunk_sentences, current_len = [], [], 0
+    max_tokens = max_length - 2 # Account for [CLS] and [SEP] tokens
     for sent in sentences:
+        # Use tokenizer.encode to get accurate token count (more reliable than tokenize)
+        token_ids = tokenizer.encode(sent, add_special_tokens=False)
+        token_len = len(token_ids)
+        if token_len > max_tokens:
+            # Sentence is too long even by itself, handle appropriately
+            # Option 1: Truncate the sentence (simplest)
+            logger.warning(f"Sentence truncated as it exceeds max_length: '{sent[:100]}...'")
+            truncated_sent = tokenizer.decode(token_ids[:max_tokens])
+            # If there was a previous chunk, add it first
+            if current_chunk_sentences:
+                 chunks.append(" ".join(current_chunk_sentences))
+            chunks.append(truncated_sent) # Add the single truncated sentence as its own chunk
+            current_chunk_sentences, current_len = [], 0 # Reset chunk
+            continue # Move to the next sentence
+        if current_len + token_len <= max_tokens:
+            current_chunk_sentences.append(sent)
             current_len += token_len
         else:
+            # Current chunk is full, finalize it
+            if current_chunk_sentences:
+                chunks.append(" ".join(current_chunk_sentences))
+            # Start a new chunk with the current sentence
+            current_chunk_sentences = [sent]
             current_len = token_len
+    # Add the last remaining chunk
+    if current_chunk_sentences:
+        chunks.append(" ".join(current_chunk_sentences))
     return chunks
+# --- UI ---
 st.title("AI Article Detection")
+text = st.text_area("Enter text to classify", height=150, placeholder="Paste your text here...")
+if st.button("Classify", type="primary"):
+    if not text or not text.strip():
         st.warning("Please enter some text.")
     else:
+        with st.spinner("Analyzing... Please wait."):
+            try:
+                # Split text using the tokenizer reference
+                chunks = split_into_chunks(text, tokenizer, max_length=model.config.max_position_embeddings)
+                logger.info(f"Split text into {len(chunks)} chunks.")
+                if not chunks:
+                     st.warning("Could not process the input text (perhaps it's too short or contains only delimiters?).")
+                     st.stop()
+                # Tokenize chunks and move tensors to the correct device
+                inputs = tokenizer(
+                    chunks,
+                    return_tensors="pt",
+                    padding=True,         # Pad sequences to the longest in the batch
+                    truncation=True,      # Truncate sequences longer than max_length
+                    max_length=model.config.max_position_embeddings # Use model's max length
+                ).to(device) # Move inputs to the same device as the model
+                # Perform inference
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    logits = outputs.logits
+                    # Softmax runs on the model's device; the result is then moved to CPU for the aggregation below
+                    probs = F.softmax(logits, dim=-1).cpu() # Move probs to CPU
+                    preds = torch.argmax(probs, dim=-1) # Argmax on CPU probabilities
+                # Process results
+                chunk_results = []
+                for i, chunk in enumerate(chunks):
+                    pred_index = preds[i].item() # Get prediction index for this chunk
+                    chunk_results.append({
+                        "text": chunk,
+                        "label": LABELS[pred_index],
+                        "color": COLORS[pred_index],
+                        "conf": probs[i, pred_index].item() * 100, # Get confidence for the predicted class
+                    })
+                # Calculate overall prediction based on average probability across chunks
+                if probs.numel() > 0: # Check if probs tensor is not empty
+                    avg_probs = torch.mean(probs, dim=0) # Average probabilities across the batch dimension
+                    final_class_index = torch.argmax(avg_probs).item()
+                    final_label = LABELS[final_class_index]
+                    final_conf = avg_probs[final_class_index].item() * 100
+                    # Display final prediction
+                    st.subheader("📊 Final Prediction")
+                    st.markdown(
+                        f"<div style='background-color:{COLORS[final_class_index]}; padding:1rem; border-radius:0.5rem; border: 1px solid #ccc;'>"
+                        f"Based on the analysis, the text is most likely: <b>{final_label}</b> (Confidence: {final_conf:.1f}%)</div>",
+                        unsafe_allow_html=True
+                    )
+                else:
+                    st.warning("Could not generate predictions for the provided text.")
+                # Display per-chunk predictions in an expander
+                with st.expander("See per-chunk predictions and confidence"):
+                    if chunk_results:
+                        for result in chunk_results:
+                            st.markdown(
+                                f"<div title='Confidence: {result['conf']:.1f}%' "
+                                f"style='background-color:{result['color']}; padding:0.75rem; margin-bottom:0.5rem; border-radius:0.5rem; border: 1px solid #ddd;'>"
+                                f"<i>({result['label']} - {result['conf']:.1f}%)</i><br>{result['text']}</div>",
+                                unsafe_allow_html=True
+                            )
+                    else:
+                         st.write("No chunk predictions were generated.")
+            except Exception as e:
+                st.error(f"An error occurred during analysis: {e}")
+                logger.error(f"Analysis failed: {e}", exc_info=True)