Spaces:
Sleeping
Sleeping
File size: 10,740 Bytes
539cdde b04beef 539cdde b04beef d2de988 539cdde b04beef 6cad8c2 b04beef 539cdde 6cad8c2 b04beef 6cad8c2 b04beef 539cdde b04beef 539cdde 155070e 539cdde 155070e 539cdde 155070e 539cdde 155070e 539cdde 155070e 539cdde 155070e 539cdde b04beef 539cdde b04beef 539cdde 155070e 539cdde 155070e 539cdde b04beef 539cdde b04beef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import difflib
import spacy
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from collections import Counter
import uvicorn
import os
import torch
# Download NLTK resources (VADER lexicon for sentiment, punkt for sentence
# tokenization, stopwords). Failures are non-fatal: the app still starts,
# but features relying on these corpora may break at request time.
try:
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception:  # narrowed from bare `except`, which also swallowed SystemExit/KeyboardInterrupt
    print("Could not download NLTK resources. Some features may be limited.")
app = FastAPI()

# Configure CORS so a browser front-end served from another origin can call
# this API directly.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# very permissive — tighten to the actual front-end origin(s) for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)
# Lazily-initialized humanization pipeline; populated by get_humanize_pipeline().
humanize_pipe = None

# Load the NLP models used by the analysis endpoints.
try:
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")
    # Initialize sentiment analyzer
    sentiment_analyzer = SentimentIntensityAnalyzer()
    print("NLP models loaded successfully!")
except Exception as e:
    print(f"Error loading NLP models: {e}")
    # Define the names anyway so a failed load surfaces inside the endpoints'
    # own exception handlers (a clean HTTP 500) instead of a NameError when
    # the first request arrives.
    nlp = None
    sentiment_analyzer = None

    # Fallback kept for compatibility; returns a fixed message.
    def mock_function(text):
        return "Model could not be loaded. This is a fallback response."
def get_humanize_pipeline():
    """
    Lazily load and cache the humanization pipeline on first use.

    Uses plain CPU/float32 settings that don't require accelerate.

    Returns:
        The cached transformers text2text-generation pipeline, or — when the
        model cannot be loaded — a stand-in callable with the same call shape
        that reports the failure in its generated text.
    """
    global humanize_pipe
    if humanize_pipe is not None:
        return humanize_pipe
    try:
        print("Loading the humanizer model on CPU...")
        # Force CPU usage
        device = torch.device("cpu")
        model = AutoModelForSeq2SeqLM.from_pretrained(
            "danibor/flan-t5-base-humanizer",
            torch_dtype=torch.float32  # float16 is poorly supported on CPU
        )
        tokenizer = AutoTokenizer.from_pretrained("danibor/flan-t5-base-humanizer")
        humanize_pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            device=device  # Explicitly specify CPU
        )
        print("Humanizer model loaded successfully!")
        return humanize_pipe
    except Exception as e:
        print(f"Error loading humanizer model: {e}")

        # Pipeline-shaped stub that just echoes the failure.
        def simple_pipeline(text, **kwargs):
            return [{"generated_text": f"Could not process: {text} (Model failed to load)"}]

        # Deliberately NOT cached in humanize_pipe: a transient failure
        # (e.g. a network hiccup while downloading weights) is retried on
        # the next request instead of poisoning every future call.
        return simple_pipeline
# Request/response models for the API endpoints.
class TextRequest(BaseModel):
    """Request body shared by /humanize and /analyze."""
    # Raw text to process.
    text: str
class HumanizeResponse(BaseModel):
    """Response schema for /humanize."""
    original_text: str
    humanized_text: str
    # Word-level diff entries: {'operation': '+'|'-'|' ', 'text': word}
    diff: list
    original_word_count: int
    humanized_word_count: int
    # Output of perform_nlp_analysis(): {'original': {...}, 'humanized': {...}}
    nlp_analysis: dict
class AnalyzeResponse(BaseModel):
    """Response schema for /analyze (analysis only, no rewriting)."""
    text: str
    word_count: int
    # VADER polarity scores: neg / neu / pos / compound.
    sentiment: dict
    # Entity label -> list of unique entity strings.
    entities: dict
    # Up to 10 noun-chunk strings.
    key_phrases: list
    # Sentence/word/syllable counts plus Flesch Reading Ease.
    readability: dict
    # POS distribution, lexical diversity, dependency-relation counts.
    complexity: dict
@app.post("/humanize", response_model=HumanizeResponse)
async def humanize_text(request: TextRequest):
input_text = request.text
try:
# Get or initialize the pipeline
pipeline = get_humanize_pipeline()
# Generate humanized text with basic settings
result = pipeline(
input_text,
max_length=min(500, len(input_text) * 2), # Limit max length
do_sample=True
)
humanized_text = result[0]['generated_text']
# Get the differences
diff = get_diff(input_text, humanized_text)
# Process both texts with NLP
nlp_analysis = perform_nlp_analysis(input_text, humanized_text)
return {
'original_text': input_text,
'humanized_text': humanized_text,
'diff': diff,
'original_word_count': len(input_text.split()),
'humanized_word_count': len(humanized_text.split()),
'nlp_analysis': nlp_analysis
}
except Exception as e:
print(f"Error in humanize endpoint: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}")
def get_diff(text1, text2):
    """
    Compare two texts word-by-word.

    Returns a list of {'operation': op, 'text': word} dicts, where op is
    '+' (added), '-' (removed) or ' ' (unchanged). Differ's '?' hint lines
    are filtered out.
    """
    differ = difflib.Differ()
    return [
        {'operation': entry[0], 'text': entry[2:]}
        for entry in differ.compare(text1.split(), text2.split())
        if entry[0] in ('+', '-', ' ')
    ]
def perform_nlp_analysis(original_text, humanized_text):
    """
    Run the full analysis suite (sentiment, entities, key phrases,
    readability, complexity) over both the original and humanized texts.

    Returns {'original': {...}, 'humanized': {...}}, one analysis dict per
    version.
    """
    def analyze_one(text):
        # Parse once and share the doc across every doc-based analyzer.
        doc = nlp(text)
        return {
            'sentiment': sentiment_analyzer.polarity_scores(text),
            'entities': extract_entities(doc),
            'key_phrases': extract_key_phrases(doc),
            'readability': calculate_readability(text),
            'complexity': analyze_complexity(doc),
        }

    return {
        'original': analyze_one(original_text),
        'humanized': analyze_one(humanized_text),
    }
def extract_entities(doc):
    """
    Group the document's named entities by label.

    Returns a dict mapping entity label (e.g. 'PERSON', 'ORG') to the list
    of unique entity strings, in order of first appearance.
    """
    grouped = {}
    for ent in doc.ents:
        bucket = grouped.setdefault(ent.label_, [])
        if ent.text not in bucket:
            bucket.append(ent.text)
    return grouped
def extract_key_phrases(doc):
    """Return the text of at most the first 10 noun chunks in the document."""
    phrases = []
    for chunk in doc.noun_chunks:
        phrases.append(chunk.text)
        if len(phrases) == 10:  # stop early once the cap is reached
            break
    return phrases
def calculate_readability(text):
    """
    Compute basic readability statistics for *text*.

    Returns a dict with sentence/word/syllable counts, average words per
    sentence, and the Flesch Reading Ease score (higher = easier to read).
    """
    # Floor both counts at 1 so the ratios below never divide by zero.
    sentences = max(len(nltk.sent_tokenize(text)), 1)
    words = max(len(text.split()), 1)
    syllables = count_syllables(text)
    avg_words_per_sentence = words / sentences
    # Flesch Reading Ease: 206.835 - 1.015*(words/sentence) - 84.6*(syllables/word)
    flesch = 206.835 - 1.015 * avg_words_per_sentence - 84.6 * (syllables / words)
    return {
        'sentence_count': sentences,
        'word_count': words,
        'avg_words_per_sentence': round(avg_words_per_sentence, 2),
        'syllable_count': syllables,
        'flesch_reading_ease': round(flesch, 2),
    }
def count_syllables(text):
    """
    Estimate the syllable count of *text* with a simple heuristic:
    each maximal vowel group ([aeiouy]+) counts as one syllable, a trailing
    'e' is treated as silent, and every word counts at least one syllable.
    """
    total = 0
    # Lowercase, then take maximal alphabetic runs as words.
    for word in re.findall(r'[a-z]+', text.lower()):
        if word.endswith('e'):
            word = word[:-1]  # silent final 'e' (e.g. "cake")
        total += max(len(re.findall(r'[aeiouy]+', word)), 1)
    return total
def analyze_complexity(doc):
    """
    Summarize structural complexity of a parsed document: part-of-speech
    distribution, lexical diversity (unique tokens / total tokens), and
    dependency-relation counts.
    """
    pos_distribution = Counter(token.pos_ for token in doc)
    dependency_types = Counter(token.dep_ for token in doc)
    token_total = len(doc)
    distinct_forms = {token.text.lower() for token in doc}
    diversity = len(distinct_forms) / token_total if token_total > 0 else 0
    return {
        'pos_distribution': dict(pos_distribution),
        'lexical_diversity': round(diversity, 4),
        'dependency_types': dict(dependency_types),
    }
@app.post("/analyze", response_model=AnalyzeResponse)
async def analyze_text(request: TextRequest):
"""Endpoint to just analyze text without humanizing it."""
input_text = request.text
try:
# Process text with NLP
doc = nlp(input_text)
# Analyze text
sentiment = sentiment_analyzer.polarity_scores(input_text)
entities = extract_entities(doc)
key_phrases = extract_key_phrases(doc)
readability = calculate_readability(input_text)
complexity = analyze_complexity(doc)
return {
'text': input_text,
'word_count': len(input_text.split()),
'sentiment': sentiment,
'entities': entities,
'key_phrases': key_phrases,
'readability': readability,
'complexity': complexity
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")
# Root endpoint doubling as the Hugging Face Spaces health check.
@app.get("/")
async def root():
    """Report that the service is up."""
    return {"message": "Text Analysis and Humanization API is running!"}
# For local development
if __name__ == "__main__":
    # Port 7860 matches the Hugging Face Spaces convention; reload=True is
    # for development only.
    # NOTE(review): "app:app" assumes this file is named app.py — confirm.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)