from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import difflib
import spacy
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from collections import Counter
import uvicorn
import os
import torch

# Download NLTK resources
try:
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception:
    print("Could not download NLTK resources. Some features may be limited.")
app = FastAPI()

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)
# Global variable for the pipeline
humanize_pipe = None

# Load NLP models
try:
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")
    # Initialize sentiment analyzer
    sentiment_analyzer = SentimentIntensityAnalyzer()
    print("NLP models loaded successfully!")
except Exception as e:
    print(f"Error loading NLP models: {e}")

    # Create a fallback function if the models fail to load
    def mock_function(text):
        return "Model could not be loaded. This is a fallback response."
def get_humanize_pipeline():
    """
    Lazy-load the humanization pipeline on first use.
    Uses standard settings that don't require accelerate.
    """
    global humanize_pipe
    if humanize_pipe is None:
        try:
            print("Loading the humanizer model on CPU...")
            # Force CPU usage
            device = torch.device("cpu")
            # Load model with basic settings (no accelerate needed)
            model = AutoModelForSeq2SeqLM.from_pretrained(
                "danibor/flan-t5-base-humanizer",
                torch_dtype=torch.float32  # Use float32 instead of float16 for CPU
            )
            tokenizer = AutoTokenizer.from_pretrained("danibor/flan-t5-base-humanizer")
            # Create pipeline with basic settings
            humanize_pipe = pipeline(
                "text2text-generation",
                model=model,
                tokenizer=tokenizer,
                device=device  # Explicitly specify CPU
            )
            print("Humanizer model loaded successfully!")
            return humanize_pipe
        except Exception as e:
            print(f"Error loading humanizer model: {e}")

            # Fall back to a pipeline-like function that reports the failure
            def simple_pipeline(text, **kwargs):
                return [{"generated_text": f"Could not process: {text} (Model failed to load)"}]

            humanize_pipe = simple_pipeline
            return humanize_pipe
    return humanize_pipe
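
# Illustrative usage sketch (hypothetical input text; assumes the model
# weights download successfully on first call):
#   pipe = get_humanize_pipeline()
#   out = pipe("Rewrite this sentence to sound more natural.")
#   print(out[0]["generated_text"])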
# Define request models
class TextRequest(BaseModel):
    text: str


class HumanizeResponse(BaseModel):
    original_text: str
    humanized_text: str
    diff: list
    original_word_count: int
    humanized_word_count: int
    nlp_analysis: dict


class AnalyzeResponse(BaseModel):
    text: str
    word_count: int
    sentiment: dict
    entities: dict
    key_phrases: list
    readability: dict
    complexity: dict
@app.post("/humanize", response_model=HumanizeResponse)  # route path assumed
async def humanize_text(request: TextRequest):
    input_text = request.text
    try:
        # Get or initialize the pipeline (named `pipe` to avoid shadowing
        # the `pipeline` factory imported from transformers)
        pipe = get_humanize_pipeline()
        # Generate humanized text with basic settings
        result = pipe(
            input_text,
            max_length=min(500, len(input_text) * 2),  # Limit max length
            do_sample=True
        )
        humanized_text = result[0]['generated_text']
        # Get the differences
        diff = get_diff(input_text, humanized_text)
        # Process both texts with NLP
        nlp_analysis = perform_nlp_analysis(input_text, humanized_text)
        return {
            'original_text': input_text,
            'humanized_text': humanized_text,
            'diff': diff,
            'original_word_count': len(input_text.split()),
            'humanized_word_count': len(humanized_text.split()),
            'nlp_analysis': nlp_analysis
        }
    except Exception as e:
        print(f"Error in humanize endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}")
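
# Illustrative request/response sketch (hypothetical values; the /humanize
# path above is an assumption, since the decorator was absent in the listing):
#   POST /humanize  {"text": "The results was good."}
#   -> {"original_text": "...", "humanized_text": "...",
#       "diff": [...], "original_word_count": 4, ...}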
def get_diff(text1, text2):
    """
    Generate a list of word-level changes between two texts.
    Returns a list of dicts {'operation': op, 'text': word},
    where op is '+' for addition, '-' for deletion, or ' ' for unchanged.
    """
    d = difflib.Differ()
    diff = list(d.compare(text1.split(), text2.split()))
    result = []
    for item in diff:
        operation = item[0]
        if operation in ['+', '-', ' ']:
            text = item[2:]
            result.append({'operation': operation, 'text': text})
    return result
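
# Worked example of the expected shape (difflib prefixes each word with a
# two-character code; '?' guide lines are filtered out above):
#   get_diff("the cat sat", "the dog sat")
#   -> [{'operation': ' ', 'text': 'the'},
#       {'operation': '-', 'text': 'cat'},
#       {'operation': '+', 'text': 'dog'},
#       {'operation': ' ', 'text': 'sat'}]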
def perform_nlp_analysis(original_text, humanized_text):
    """
    Perform comprehensive NLP analysis on both original and humanized text.
    """
    # Process both texts with spaCy
    original_doc = nlp(original_text)
    humanized_doc = nlp(humanized_text)
    # Sentiment analysis
    original_sentiment = sentiment_analyzer.polarity_scores(original_text)
    humanized_sentiment = sentiment_analyzer.polarity_scores(humanized_text)
    # Extract named entities
    original_entities = extract_entities(original_doc)
    humanized_entities = extract_entities(humanized_doc)
    # Extract key phrases using noun chunks
    original_phrases = extract_key_phrases(original_doc)
    humanized_phrases = extract_key_phrases(humanized_doc)
    # Readability metrics
    original_readability = calculate_readability(original_text)
    humanized_readability = calculate_readability(humanized_text)
    # Complexity metrics
    original_complexity = analyze_complexity(original_doc)
    humanized_complexity = analyze_complexity(humanized_doc)
    # Compile all results
    result = {
        'original': {
            'sentiment': original_sentiment,
            'entities': original_entities,
            'key_phrases': original_phrases,
            'readability': original_readability,
            'complexity': original_complexity
        },
        'humanized': {
            'sentiment': humanized_sentiment,
            'entities': humanized_entities,
            'key_phrases': humanized_phrases,
            'readability': humanized_readability,
            'complexity': humanized_complexity
        }
    }
    return result
def extract_entities(doc):
    """Extract and categorize named entities from a spaCy document."""
    entities = {}
    for ent in doc.ents:
        if ent.label_ not in entities:
            entities[ent.label_] = []
        if ent.text not in entities[ent.label_]:
            entities[ent.label_].append(ent.text)
    return entities
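
# Example of the expected shape (entity labels depend on the spaCy model,
# so treat these values as illustrative):
#   extract_entities(nlp("Apple was founded by Steve Jobs in California."))
#   -> {'ORG': ['Apple'], 'PERSON': ['Steve Jobs'], 'GPE': ['California']}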
def extract_key_phrases(doc):
    """Extract key phrases using noun chunks."""
    return [chunk.text for chunk in doc.noun_chunks][:10]  # Keep only the first 10
def calculate_readability(text):
    """Calculate basic readability metrics."""
    # Count sentences
    sentences = len(list(nltk.sent_tokenize(text)))
    if sentences == 0:
        sentences = 1  # Avoid division by zero
    # Count words
    words = len(text.split())
    if words == 0:
        words = 1  # Avoid division by zero
    # Average words per sentence
    avg_words_per_sentence = words / sentences
    # Count syllables (simplified approach)
    syllables = count_syllables(text)
    # Calculate Flesch Reading Ease
    flesch = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    return {
        'sentence_count': sentences,
        'word_count': words,
        'avg_words_per_sentence': round(avg_words_per_sentence, 2),
        'syllable_count': syllables,
        'flesch_reading_ease': round(flesch, 2)
    }
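
# Worked example of the Flesch computation for "The cat sat on the mat."
# (1 sentence, 6 words, 6 syllables under the heuristic below):
#   206.835 - 1.015 * (6 / 1) - 84.6 * (6 / 6) = 116.145
# Higher scores mean easier text; ~90-100 is typical of very simple prose,
# and trivial one-syllable sentences can exceed 100, as here.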
def count_syllables(text):
    """Count syllables in text (simplified approach)."""
    # This is a simplified syllable counter
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    count = 0
    for word in words:
        word = word.strip()
        if not word:
            continue
        # Strip a trailing (often silent) 'e', then count vowel groups as syllables
        if word[-1] == 'e':
            word = word[:-1]
        vowel_count = len(re.findall(r'[aeiouy]+', word))
        if vowel_count == 0:
            vowel_count = 1
        count += vowel_count
    return count
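
# Illustrative counts from this heuristic (not exact linguistics):
#   "beautiful" -> 3 ('eau', 'i', 'u')    "code" -> 1 (trailing 'e' stripped)
#   "rhythm"    -> 1 ('y' counts as a vowel here)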
def analyze_complexity(doc):
    """Analyze text complexity using POS tags and dependency parsing."""
    # Count POS tags
    pos_counts = Counter(token.pos_ for token in doc)
    # Calculate lexical diversity (unique tokens / total tokens)
    total_tokens = len(doc)
    unique_tokens = len({token.text.lower() for token in doc})
    lexical_diversity = unique_tokens / total_tokens if total_tokens > 0 else 0
    # Count dependency relationship types
    dep_counts = Counter(token.dep_ for token in doc)
    return {
        'pos_distribution': dict(pos_counts),
        'lexical_diversity': round(lexical_diversity, 4),
        'dependency_types': dict(dep_counts)
    }
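
# Example of the lexical diversity ratio: "the cat and the dog" tokenizes
# to 5 tokens with 4 unique lowercase forms, giving 4 / 5 = 0.8.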
@app.post("/analyze", response_model=AnalyzeResponse)  # route path assumed
async def analyze_text(request: TextRequest):
    """Endpoint to analyze text without humanizing it."""
    input_text = request.text
    try:
        # Process text with NLP
        doc = nlp(input_text)
        # Analyze text
        sentiment = sentiment_analyzer.polarity_scores(input_text)
        entities = extract_entities(doc)
        key_phrases = extract_key_phrases(doc)
        readability = calculate_readability(input_text)
        complexity = analyze_complexity(doc)
        return {
            'text': input_text,
            'word_count': len(input_text.split()),
            'sentiment': sentiment,
            'entities': entities,
            'key_phrases': key_phrases,
            'readability': readability,
            'complexity': complexity
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")
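
# Example call (the /analyze path is an assumption; adjust if the real
# route differs):
#   curl -X POST http://localhost:7860/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"text": "FastAPI makes building small APIs straightforward."}'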
# Root endpoint for Hugging Face Spaces health check
@app.get("/")
async def root():
    return {"message": "Text Analysis and Humanization API is running!"}
# For local development
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)