Spaces:
Sleeping
Sleeping
File size: 3,190 Bytes
import streamlit as st
import tensorflow as tf
import numpy as np
import nltk
import os
from nltk.tokenize import sent_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
# 📁 Hugging Face cache dir — /tmp is the writable location on Spaces
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"

# 📥 Download the NLTK sentence tokenizer into a writable dir and register it.
# NOTE(review): "punkt_tab" is the resource name required by NLTK >= 3.8.2;
# older NLTK releases ship it as "punkt" — confirm against the pinned nltk
# version. (The original comment had wrapped onto its own line, which was a
# SyntaxError; it is merged back into a comment here.)
nltk_data_path = "/tmp/nltk_data"
nltk.download("punkt_tab", download_dir=nltk_data_path)
nltk.data.path.append(nltk_data_path)
# ✅ Cache the model/tokenizer so they are loaded once per Spaces session
# (the original comment had wrapped onto a bare line — a SyntaxError — and
# is restored as a real comment here).
@st.cache_resource
def load_model_and_tokenizer():
    """Load the DistilBERT tokenizer and the fine-tuned classifier.

    Both artifacts are cached under /tmp/huggingface (writable on Spaces),
    and the function itself is memoized by ``st.cache_resource`` so the
    network download happens at most once per process.

    Returns:
        tuple: ``(tokenizer, model)`` — a ``DistilBertTokenizerFast`` and a
        ``TFDistilBertForSequenceClassification`` instance.
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased",
        cache_dir="/tmp/huggingface",
    )
    model = TFDistilBertForSequenceClassification.from_pretrained(
        "sundaram07/distilbert-sentence-classifier",
        cache_dir="/tmp/huggingface",
    )
    return tokenizer, model
# Load once at import time; st.cache_resource makes Streamlit reruns cheap.
tokenizer, model = load_model_and_tokenizer()
# 🔮 Predict sentence AI probability
def predict_sentence_ai_probability(sentence):
    """Score a single sentence with the classifier.

    Args:
        sentence: A single sentence as a plain string.

    Returns:
        A numpy scalar in (0, 1): the sigmoid of the first logit.

    NOTE(review): sigmoid is applied to ``logits[0][0]``, which assumes the
    fine-tuned head emits one AI-vs-human logit in position 0. If the model
    was trained with two labels, softmax over the label axis would be the
    correct post-processing — confirm against the model card.
    """
    inputs = tokenizer(sentence, return_tensors="tf", truncation=True, padding=True)
    outputs = model(inputs)
    logits = outputs.logits
    return tf.sigmoid(logits)[0][0].numpy()
# 📊 Analyze text
def predict_ai_generated_percentage(text, threshold=0.15):
    """Split *text* into sentences and estimate the share flagged as AI.

    Args:
        text: Raw user input; surrounding whitespace is ignored.
        threshold: Score at or below which a sentence is counted as AI.
            NOTE(review): flagging *low* scores as AI looks inverted for a
            sigmoid "probability of AI" — confirm the model's label mapping
            before changing it; behavior is preserved here as-is.

    Returns:
        tuple: ``(ai_percentage, results)`` where ``results`` is a list of
        ``(sentence, score, is_ai)`` triples. Returns ``(0.0, [])`` when the
        input contains no sentences.
    """
    sentences = sent_tokenize(text.strip())
    if not sentences:
        return 0.0, []
    results = []
    for sentence in sentences:
        score = predict_sentence_ai_probability(sentence)
        results.append((sentence, score, score <= threshold))
    ai_count = sum(1 for _, _, is_ai in results if is_ai)
    return (ai_count / len(sentences)) * 100, results
# 🖥️ Streamlit UI
# NOTE(review): the original file's emoji were mojibake-corrupted in
# extraction; the glyphs below are plausible reconstructions — confirm
# against the deployed app before shipping.
st.set_page_config(page_title="AI Detector", layout="wide")
st.title("🧠 AI Content Detector")
st.markdown(
    "This app detects the percentage of **AI-generated content** using "
    "sentence-level analysis with a fine-tuned DistilBERT model."
)

# 📝 Text input
user_input = st.text_area(
    "📝 Paste your text below to check for AI-generated sentences:",
    height=300,
)

# 📤 Output placeholder (cleared on each rerun so stale results disappear)
output_container = st.empty()

# 🔍 Analyze button logic
if st.button("🔍 Analyze"):
    if not user_input.strip():
        st.warning("⚠️ Please enter some text.")
    else:
        ai_percentage, analysis_results = predict_ai_generated_percentage(user_input)
        if not analysis_results:
            st.warning("⚠️ Not enough valid sentences to analyze.")
        else:
            with output_container.container():
                st.subheader("🔎 Sentence-level Analysis")
                for i, (sentence, prob, is_ai) in enumerate(analysis_results, start=1):
                    label = "🔴 AI" if is_ai else "🟢 Human"
                    st.markdown(f"**{i}.** _{sentence}_\n\n → {label} (prob={prob:.3f})")
                st.subheader("📊 Final Result")
                st.success(f"Estimated **AI-generated content**: **{ai_percentage:.2f}%**")