from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import difflib
import spacy
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from collections import Counter
import uvicorn
import torch

# Download NLTK resources
try:
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)  # required by sent_tokenize on newer NLTK releases
    nltk.download('stopwords', quiet=True)
except Exception as e:
    print(f"Could not download NLTK resources: {e}. Some features may be limited.")

app = FastAPI()

# Configure CORS (wide open for development; restrict allow_origins before
# exposing this publicly)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Global variable for the pipeline
humanize_pipe = None

# Load NLP models
try:
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Initialize sentiment analyzer
    sentiment_analyzer = SentimentIntensityAnalyzer()

    print("NLP models loaded successfully!")
except Exception as e:
    print(f"Error loading NLP models: {e}")
    # Fall back to None so later calls fail with a clear error
    # instead of an undefined-name NameError
    nlp = None
    sentiment_analyzer = None

def get_humanize_pipeline():
    """
    Lazy-load the humanization pipeline on first use.
    Uses standard settings that don't require accelerate.
    """
    global humanize_pipe
    if humanize_pipe is None:
        try:
            print("Loading the humanizer model on CPU...")
            
            # Force CPU usage
            device = torch.device("cpu")
            
            # Load model with basic settings (no accelerate needed)
            model = AutoModelForSeq2SeqLM.from_pretrained(
                "danibor/flan-t5-base-humanizer", 
                torch_dtype=torch.float32  # Use float32 instead of float16 for CPU
            )
            tokenizer = AutoTokenizer.from_pretrained("danibor/flan-t5-base-humanizer")
            
            # Create pipeline with basic settings
            humanize_pipe = pipeline(
                "text2text-generation", 
                model=model, 
                tokenizer=tokenizer,
                device=device  # Explicitly specify CPU
            )
            
            print("Humanizer model loaded successfully!")
            return humanize_pipe
        except Exception as e:
            print(f"Error loading humanizer model: {e}")
            # Create a simple pipeline-like function that just returns the input
            def simple_pipeline(text, **kwargs):
                return [{"generated_text": f"Could not process: {text} (Model failed to load)"}]
            humanize_pipe = simple_pipeline
            return humanize_pipe
    
    return humanize_pipe
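
# Optional warm-up (a sketch, not part of the original flow): calling the loader
# once at import time would move the one-time model download/load out of the
# first /humanize request, at the cost of a slower startup.
# get_humanize_pipeline()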

# Define request models
class TextRequest(BaseModel):
    text: str

class HumanizeResponse(BaseModel):
    original_text: str
    humanized_text: str
    diff: list
    original_word_count: int
    humanized_word_count: int
    nlp_analysis: dict

class AnalyzeResponse(BaseModel):
    text: str
    word_count: int
    sentiment: dict
    entities: dict
    key_phrases: list
    readability: dict
    complexity: dict

@app.post("/humanize", response_model=HumanizeResponse)
async def humanize_text(request: TextRequest):
    input_text = request.text
    
    try:
        # Get or initialize the pipeline (named so it doesn't shadow
        # the imported transformers.pipeline factory)
        humanizer = get_humanize_pipeline()

        # Generate humanized text with basic settings
        result = humanizer(
            input_text,
            # Rough cap: max_length counts tokens while len() counts characters,
            # so this is a generous heuristic rather than an exact limit
            max_length=min(500, len(input_text) * 2),
            do_sample=True
        )
        
        humanized_text = result[0]['generated_text']
        
        # Get the differences
        diff = get_diff(input_text, humanized_text)
        
        # Process both texts with NLP
        nlp_analysis = perform_nlp_analysis(input_text, humanized_text)
        
        return {
            'original_text': input_text,
            'humanized_text': humanized_text,
            'diff': diff,
            'original_word_count': len(input_text.split()),
            'humanized_word_count': len(humanized_text.split()),
            'nlp_analysis': nlp_analysis
        }
    except Exception as e:
        print(f"Error in humanize endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}")

def get_diff(text1, text2):
    """
    Generate a list of changes between two texts.
    Returns a list of tuples (operation, text)
    where operation is '+' for addition, '-' for deletion, or ' ' for unchanged.
    """
    d = difflib.Differ()
    diff = list(d.compare(text1.split(), text2.split()))
    
    result = []
    for item in diff:
        operation = item[0]
        if operation in ['+', '-', ' ']:
            text = item[2:]
            result.append({'operation': operation, 'text': text})
    
    return result

def perform_nlp_analysis(original_text, humanized_text):
    """
    Perform comprehensive NLP analysis on both original and humanized text.
    """
    result = {}
    
    # Process both texts with spaCy
    original_doc = nlp(original_text)
    humanized_doc = nlp(humanized_text)
    
    # Sentiment analysis
    original_sentiment = sentiment_analyzer.polarity_scores(original_text)
    humanized_sentiment = sentiment_analyzer.polarity_scores(humanized_text)
    
    # Extract named entities
    original_entities = extract_entities(original_doc)
    humanized_entities = extract_entities(humanized_doc)
    
    # Extract key phrases using noun chunks
    original_phrases = extract_key_phrases(original_doc)
    humanized_phrases = extract_key_phrases(humanized_doc)
    
    # Readability metrics
    original_readability = calculate_readability(original_text)
    humanized_readability = calculate_readability(humanized_text)
    
    # Complexity metrics
    original_complexity = analyze_complexity(original_doc)
    humanized_complexity = analyze_complexity(humanized_doc)
    
    # Compile all results
    result = {
        'original': {
            'sentiment': original_sentiment,
            'entities': original_entities,
            'key_phrases': original_phrases,
            'readability': original_readability,
            'complexity': original_complexity
        },
        'humanized': {
            'sentiment': humanized_sentiment,
            'entities': humanized_entities,
            'key_phrases': humanized_phrases,
            'readability': humanized_readability,
            'complexity': humanized_complexity
        }
    }
    
    return result

def extract_entities(doc):
    """Extract and categorize named entities from a spaCy document."""
    entities = {}
    for ent in doc.ents:
        if ent.label_ not in entities:
            entities[ent.label_] = []
        if ent.text not in entities[ent.label_]:
            entities[ent.label_].append(ent.text)
    return entities

def extract_key_phrases(doc):
    """Extract key phrases using noun chunks."""
    return [chunk.text for chunk in doc.noun_chunks][:10]  # Limit to top 10

def calculate_readability(text):
    """Calculate basic readability metrics."""
    # Count sentences
    sentences = len(list(nltk.sent_tokenize(text)))
    if sentences == 0:
        sentences = 1  # Avoid division by zero
    
    # Count words
    words = len(text.split())
    if words == 0:
        words = 1  # Avoid division by zero
    
    # Average words per sentence
    avg_words_per_sentence = words / sentences
    
    # Count syllables (simplified approach)
    syllables = count_syllables(text)
    
    # Calculate Flesch Reading Ease
    flesch = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    
    return {
        'sentence_count': sentences,
        'word_count': words,
        'avg_words_per_sentence': round(avg_words_per_sentence, 2),
        'syllable_count': syllables,
        'flesch_reading_ease': round(flesch, 2)
    }
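
# Quick sanity check of the simplified metrics (illustrative, worked by hand):
#   calculate_readability("The cat sat. The dog ran.")
#   -> {'sentence_count': 2, 'word_count': 6, 'avg_words_per_sentence': 3.0,
#       'syllable_count': 6, 'flesch_reading_ease': 119.19}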

def count_syllables(text):
    """Count syllables in text (simplified approach)."""
    # This is a simplified syllable counter
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    
    count = 0
    for word in words:
        word = word.strip()
        if not word:
            continue
            
        # Count vowel groups as syllables
        if word[-1] == 'e':
            word = word[:-1]
            
        vowel_count = len(re.findall(r'[aeiouy]+', word))
        if vowel_count == 0:
            vowel_count = 1
            
        count += vowel_count
        
    return count

def analyze_complexity(doc):
    """Analyze text complexity using POS tags and dependency parsing."""
    # Count POS tags
    pos_counts = Counter([token.pos_ for token in doc])
    
    # Calculate lexical diversity
    total_tokens = len(doc)
    unique_tokens = len(set([token.text.lower() for token in doc]))
    
    lexical_diversity = unique_tokens / total_tokens if total_tokens > 0 else 0
    
    # Count dependency relationship types
    dep_counts = Counter([token.dep_ for token in doc])
    
    return {
        'pos_distribution': dict(pos_counts),
        'lexical_diversity': round(lexical_diversity, 4),
        'dependency_types': dict(dep_counts)
    }
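
# Example (illustrative): for "the cat saw the dog", spaCy yields 5 tokens with
# 4 unique lowercased forms, so lexical_diversity = 4 / 5 = 0.8.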

@app.post("/analyze", response_model=AnalyzeResponse)
async def analyze_text(request: TextRequest):
    """Endpoint to just analyze text without humanizing it."""
    input_text = request.text
    
    try:
        # Process text with NLP
        doc = nlp(input_text)
        
        # Analyze text
        sentiment = sentiment_analyzer.polarity_scores(input_text)
        entities = extract_entities(doc)
        key_phrases = extract_key_phrases(doc)
        readability = calculate_readability(input_text)
        complexity = analyze_complexity(doc)
        
        return {
            'text': input_text,
            'word_count': len(input_text.split()),
            'sentiment': sentiment,
            'entities': entities,
            'key_phrases': key_phrases,
            'readability': readability,
            'complexity': complexity
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")

# Add a root endpoint for Hugging Face Spaces health check
@app.get("/")
async def root():
    return {"message": "Text Analysis and Humanization API is running!"}

# For local development
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
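
# Example client call (a minimal sketch; assumes the API is reachable at
# http://localhost:7860 and that the `requests` package is installed):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/humanize",
#       json={"text": "The results demonstrate a statistically significant improvement."},
#   )
#   print(resp.json()["humanized_text"])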