# ashabot / app.py
# Author: roshcheeku — "Create app.py" (commit 98e78d3, verified)
import os
import base64
import json
from flask import Flask, request, jsonify
from flask_cors import CORS
import spacy
from textblob import TextBlob
import re
import tempfile
import PyPDF2
import docx
import pyttsx3
import threading
import logging
from werkzeug.utils import secure_filename
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Initialize Flask app
app = Flask(__name__)
CORS(app) # Enable CORS for all routes
# Configure environment: uploads are written under ./uploads relative to the
# process working directory.
UPLOAD_FOLDER = os.path.join(os.getcwd(), 'uploads')
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 20 * 1024 * 1024 # 20MB max upload
# Set up Hugging Face model parameters (overridable via the HF_MODEL env var)
HF_MODEL = os.environ.get('HF_MODEL', "mistralai/Mistral-7B-Instruct-v0.2")
logger.info(f"Using Hugging Face model: {HF_MODEL}")
# Dictionary to store chat sessions, keyed by session_id.
# NOTE(review): in-memory only — sessions are lost on restart and are not
# shared across multiple workers/processes.
chat_sessions = {}
# Load spaCy model, degrading gracefully so the app can still start when no
# model package is installed.
try:
    nlp = spacy.load("en_core_web_sm")
    logger.info("Successfully loaded spaCy model")
except Exception as e:
    logger.error(f"Failed to load spaCy model: {str(e)}")
    # Fallback to a larger model if that one happens to be available
    try:
        nlp = spacy.load("en_core_web_md")
        logger.info("Loaded fallback spaCy model")
    except Exception:  # fix: was a bare `except:` (also caught SystemExit/KeyboardInterrupt)
        logger.error("Could not load any spaCy model")

        # Define empty nlp function as fallback: mimics the spaCy call shape
        # but yields no noun chunks, so bias detection becomes a no-op.
        def nlp(text):
            class MockDoc:
                def __init__(self, text):
                    self.text = text
                    self.noun_chunks = []
            return MockDoc(text)
# Initialize text-to-speech engine.
# NOTE(review): `engine` is initialized here but never used in the visible
# endpoints — presumably consumed elsewhere or reserved for a future feature.
try:
    engine = pyttsx3.init()
    logger.info("Text-to-speech engine initialized")
except Exception as e:
    logger.error(f"Failed to initialize text-to-speech: {str(e)}")
    # Fall back to None so the rest of the app starts without TTS support.
    engine = None
def load_hf_model():
    """Load the configured Hugging Face causal LM and wrap it in a
    text-generation pipeline.

    Returns:
        The transformers text-generation pipeline, or None when loading
        fails (callers treat None as "model unavailable").
    """
    try:
        logger.info(f"Loading model: {HF_MODEL}")
        tok = AutoTokenizer.from_pretrained(HF_MODEL)
        lm = AutoModelForCausalLM.from_pretrained(HF_MODEL)
        text_gen = pipeline("text-generation", model=lm, tokenizer=tok)
        logger.info("Successfully loaded model and tokenizer")
    except Exception as exc:
        logger.error(f"Error loading model: {str(exc)}")
        return None
    return text_gen


# Load model once at startup; endpoints report unavailability when this is None.
generator = load_hf_model()
# Bias detection patterns and empowering messages.
# Keys are bias-topic phrases matched word-by-word against user input by
# detect_gender_bias(); values are the empowering replies returned verbatim.
bias_patterns = {
    "suitability for leadership": "Absolutely! Women have led globally—in government, business, and science.",
    "emotional stability": "Emotional intelligence is a leadership asset for everyone.",
    "tech ability": "Women are innovators in tech—from Ada Lovelace to today's pioneers.",
    "logical thinking": "Logic is a human ability, not gender-specific.",
    "career vs family": "Many women successfully balance career and family. Stereotypes don't define reality.",
    "aggressiveness in women": "Assertiveness is a leadership strength for all genders.",
    "women in STEM": "Women have been crucial in STEM fields, past and present.",
    "women in politics": "Women have led nations and made major political impacts globally.",
    "women's emotional nature": "Emotions are part of being human and a leadership strength.",
    "women's competence in business": "Women are highly competent business leaders and entrepreneurs.",
    "women's role in history": "Women have made monumental contributions across history."
}
def suggest_reframing(pattern):
    """Return an inclusive rephrasing tip for a detected bias pattern.

    Falls back to a generic suggestion when the pattern is unknown.
    """
    tips = {
        "suitability for leadership": "Ask about leadership qualities in all individuals.",
        "emotional stability": "Focus on emotional intelligence across all leaders.",
        "tech ability": "Highlight tech expertise without linking to gender.",
        "logical thinking": "Emphasize logical thinking as a universal human trait.",
        "career vs family": "Discuss career and family balance inclusively.",
        "aggressiveness in women": "Celebrate assertiveness for all genders.",
        "women in STEM": "Celebrate contributions of everyone in STEM.",
        "women in politics": "Recognize political leadership without assumptions.",
        "women's emotional nature": "Focus on emotional intelligence as a human strength.",
        "women's competence in business": "Highlight business leadership across all people.",
        "women's role in history": "Explore contributions from all genders.",
    }
    return tips.get(pattern, "Consider rephrasing to be more inclusive.")
def analyze_sentiment(text):
    """Classify text as 'positive', 'negative', or 'neutral' from TextBlob polarity.

    A dead band of ±0.1 around zero maps to 'neutral'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity < -0.1:
        return "negative"
    return "positive" if polarity > 0.1 else "neutral"
def detect_gender_bias(text):
    """Scan a message for gender-biased framings about women.

    Returns an empowering counter-message plus a reframing suggestion when a
    bias pattern is detected, otherwise None.
    """
    lowered = text.lower()
    doc = nlp(lowered)
    # Only examine messages whose noun chunks actually mention women.
    if not any("women" in chunk.text for chunk in doc.noun_chunks):
        return None
    for pattern, message in bias_patterns.items():
        # Fix: the original joined the pattern's words with '|', so any single
        # word — including fillers like "for", "in", or "vs" — triggered a
        # match. Require every word of the pattern to appear instead.
        if all(re.search(r'\b' + re.escape(word) + r'\b', lowered)
               for word in pattern.split()):
            suggestion = suggest_reframing(pattern)
            return (
                f"{bias_patterns[pattern]}\n\n"
                "🛠️ Suggestion: " + suggestion
            )
    return None
# File handling functions
def extract_text_from_pdf(file_path):
    """Extract text from PDF files.

    Returns the concatenated text of all pages, or an error string when the
    file cannot be read (callers check for the "Error" prefix).
    """
    try:
        with open(file_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            return "".join(page.extract_text() for page in reader.pages)
    except Exception as e:
        logger.error(f"Error reading PDF: {str(e)}")
        return f"Error reading PDF: {str(e)}"
def extract_text_from_docx(file_path):
    """Extract text from DOCX files.

    Returns paragraph texts joined by newlines, or an error string when the
    file cannot be read (callers check for the "Error" prefix).
    """
    try:
        document = docx.Document(file_path)
        return "\n".join(para.text for para in document.paragraphs)
    except Exception as e:
        logger.error(f"Error reading DOCX: {str(e)}")
        return f"Error reading DOCX: {str(e)}"
def process_file(file_path, file_type):
    """Process different file types and extract text.

    Dispatches on the (lowercased) file_type string; unsupported or
    placeholder types return descriptive messages rather than raising.
    """
    if not os.path.exists(file_path):
        return f"File not found: {file_path}"
    ext = file_type.lower()
    if 'pdf' in ext:
        return extract_text_from_pdf(file_path)
    if ext in ('doc', 'docx'):
        return extract_text_from_docx(file_path)
    if ext in ('txt', 'text'):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                return handle.read()
        except Exception as e:
            logger.error(f"Error reading text file: {str(e)}")
            return f"Error reading text file: {str(e)}"
    if ext in ('xls', 'xlsx'):
        # Placeholder for Excel files - consider integrating pandas for actual processing
        return "Excel file detected. Specific content analysis currently limited."
    if ext in ('jpg', 'jpeg', 'png'):
        # Placeholder for image files - consider adding OCR
        return "Image file detected. OCR processing would occur here."
    return f"Processing for {ext} files is not supported."
def save_base64_file(base64_string, filename, file_type):
    """Save a base64 encoded file to disk.

    The filename is sanitized with secure_filename before writing into the
    configured upload folder. Returns the saved path, or None on failure.
    """
    try:
        raw = base64.b64decode(base64_string)
        destination = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        with open(destination, 'wb') as out:
            out.write(raw)
    except Exception as e:
        logger.error(f"Error saving file: {str(e)}")
        return None
    return destination
def get_or_create_chat_session(session_id):
    """Create a new chat session or return an existing one.

    New sessions are seeded with a system-style instruction message and an
    assistant greeting so the model has initial context.
    """
    session = chat_sessions.get(session_id)
    if session is None:
        logger.info(f"Creating new chat session: {session_id}")
        instruction = (
            "You are Ashabot, an ethical AI chatbot. Always respond respectfully and avoid engaging in gender-biased or discriminatory content. "
            "If such content is detected, respond with educational, inclusive, and fact-based replies. "
            "You can understand document content and respond to various file types including PDFs, documents, and images."
        )
        greeting = (
            "I am Ashabot, an ethical AI chatbot. I'm here to assist you with information and responses that are respectful and inclusive. "
            "I can help analyze document content and respond to various file types. How can I assist you today?"
        )
        session = {
            "history": [
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": greeting},
            ]
        }
        chat_sessions[session_id] = session
    return session
def generate_suggestions(response_text):
    """Generate follow-up suggestions based on the response.

    Uses simple keyword heuristics; pads with generic suggestions when fewer
    than two topic-specific ones match. Always returns at most two items.
    """
    lowered = response_text.lower()
    followups = []
    if "leadership" in lowered:
        followups.append("Tell me more about leadership qualities")
    # Note: "STEM" is matched case-sensitively on purpose (acronym).
    if "STEM" in response_text or "science" in lowered:
        followups.append("How can we encourage more diversity in STEM?")
    if "career" in lowered:
        followups.append("What career opportunities align with my skills?")
    if len(followups) < 2:
        followups += [
            "How can I learn more about this topic?",
            "Could you provide some resources on this subject?",
        ]
    return followups[:2]
def generate_opportunities(text, opportunities_data=None):
    """Generate potential opportunities based on user input and profile data.

    Matches the user's profile skills/interests against the message with a
    simple case-insensitive substring test; falls back to one generic
    learning resource when nothing matches.
    """
    lowered = text.lower()
    matched = []
    if opportunities_data:
        skills = opportunities_data.get('skills', [])
        interests = opportunities_data.get('interests', [])
        # Simple matching algorithm - in production this would be more sophisticated
        if any(skill.lower() in lowered for skill in skills):
            matched.append({
                "title": "Skill Development Opportunity",
                "description": "Based on your skills, consider enhancing your expertise in this area.",
                "url": "https://example.com/skill-development",
            })
        if any(interest.lower() in lowered for interest in interests):
            matched.append({
                "title": "Interest-Based Opportunity",
                "description": "This aligns with your interests. Explore more in this field.",
                "url": "https://example.com/explore-interests",
            })
    if not matched:
        matched.append({
            "title": "Learning Resource",
            "description": "Explore more about this topic through our learning platform",
            "url": "https://example.com/learn-more",
        })
    return matched
def generate_response_with_hf(prompt, chat_history=None):
    """Generate a reply using the Hugging Face text-generation pipeline.

    The chat history is flattened into a "User:/Assistant:" transcript, the
    model continues it, and everything after the final "Assistant:" marker is
    returned. Errors are reported as user-facing strings, never raised.
    """
    if generator is None:
        return "Model not available. Please check server logs."
    try:
        # Flatten the conversation history into the prompt transcript.
        parts = []
        for message in (chat_history or []):
            role = message.get("role", "")
            content = message.get("content", "")
            if role == "user":
                parts.append(f"User: {content}\n")
            elif role == "assistant":
                parts.append(f"Assistant: {content}\n")
        parts.append(f"User: {prompt}\nAssistant:")
        formatted_prompt = "".join(parts)
        # Sampled generation; max_length bounds prompt + completion tokens.
        outputs = generator(
            formatted_prompt,
            max_length=1024,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
        # Keep only the text after the last "Assistant:" marker.
        reply = outputs[0]['generated_text'].split("Assistant:")[-1].strip()
        if not reply:
            reply = "I apologize, but I couldn't generate a response. Please try rephrasing your question."
        return reply
    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return f"An error occurred while generating a response: {str(e)}"
@app.route('/api/chat', methods=['POST'])
def chat():
    """Main endpoint for chat functionality.

    Expects a JSON body with: session_id (required), message, has_files,
    files (list of {file_name, file_data (base64), file_type}), and
    opportunities_data ({skills, interests}).

    Returns JSON with 'response', 'suggestions', and 'opportunities'.
    Biased messages short-circuit with an educational reply; otherwise any
    attached files are decoded, their text appended to the prompt, and the
    HF model generates the answer.
    """
    try:
        # Parse request data.
        # NOTE(review): request.json is None for non-JSON bodies — the
        # .get() below would then raise and be caught by the outer handler
        # as a 500 rather than a 400.
        data = request.json
        session_id = data.get('session_id')
        user_message = data.get('message', '')
        has_files = data.get('has_files', False)
        files = data.get('files', [])
        opportunities_data = data.get('opportunities_data', {})
        if not session_id:
            return jsonify({'error': 'Session ID is required'}), 400
        logger.info(f"Received request for session {session_id}, has_files: {has_files}")
        # Get (or lazily create) the per-session history.
        chat_session = get_or_create_chat_session(session_id)
        # Analyze sentiment (logged only; not used in the response).
        sentiment = analyze_sentiment(user_message)
        logger.info(f"Message sentiment: {sentiment}")
        # Check for gender bias; a hit short-circuits model generation.
        bias_warning = detect_gender_bias(user_message)
        if bias_warning:
            logger.info("Gender bias detected")
            response_text = f"I noticed some gender bias in your message. {bias_warning}\n\nLet's continue the conversation inclusively! 🌟"
            # Add messages to history
            chat_session["history"].append({"role": "user", "content": user_message})
            chat_session["history"].append({"role": "assistant", "content": response_text})
            return jsonify({
                'response': response_text,
                'suggestions': generate_suggestions(response_text),
                'opportunities': []
            })
        # Process files if present: decode, save, extract text.
        file_contents = []
        if has_files and files:
            for file_info in files:
                file_name = file_info.get('file_name')
                file_data = file_info.get('file_data')
                file_type = file_info.get('file_type')
                if file_name and file_data:
                    # Save base64 payload to disk under the upload folder.
                    file_path = save_base64_file(file_data, file_name, file_type)
                    if file_path:
                        # Process file based on type; extractors signal
                        # failure via "Error..."/"Processing for..." strings.
                        file_content = process_file(file_path, file_type)
                        if not file_content.startswith("Error") and not file_content.startswith("Processing for"):
                            file_contents.append(f"Content from {file_name}: {file_content[:5000]}")  # Limit to 5000 chars per file
                            # Add message about successfully processed file
                            logger.info(f"Successfully processed file: {file_name}")
                        else:
                            logger.warning(f"Issue processing file: {file_content}")
                    else:
                        logger.error(f"Failed to save file: {file_name}")
        # Construct complete message with both user text and file contents.
        complete_message = user_message
        if file_contents:
            complete_message += "\n\nAttached files content:\n" + "\n\n".join(file_contents)
        # Add user message to history before generating, so the model sees it.
        chat_session["history"].append({"role": "user", "content": complete_message})
        # Generate response with HF model.
        try:
            response_text = generate_response_with_hf(complete_message, chat_session["history"])
            # Add assistant response to history.
            chat_session["history"].append({"role": "assistant", "content": response_text})
            # Keep history at a reasonable size: the 2 seed messages plus
            # the last 10 user/assistant turns.
            if len(chat_session["history"]) > 12:
                chat_session["history"] = chat_session["history"][:2] + chat_session["history"][-10:]
            # Generate suggestions based on response.
            suggestions = generate_suggestions(response_text)
            # Generate opportunities based on user message and profile.
            opportunities = generate_opportunities(complete_message, opportunities_data)
            return jsonify({
                'response': response_text,
                'suggestions': suggestions,
                'opportunities': opportunities
            })
        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            return jsonify({
                'error': f"Error generating response: {str(e)}",
                'suggestions': ["Could you try rephrasing your question?", "Let's try a different topic"],
                'opportunities': []
            }), 500
    except Exception as e:
        # Catch-all boundary: report any unexpected failure as a 500.
        logger.error(f"Error processing request: {str(e)}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/health', methods=['GET'])
def health_check():
    """Health check endpoint: reports service status and the configured model."""
    payload = {
        'status': 'ok',
        'service': 'Ashabot API',
        'model': HF_MODEL
    }
    return jsonify(payload)
@app.route('/', methods=['GET'])
def index():
    """Root endpoint with API documentation."""
    endpoint_docs = {
        '/api/chat': 'POST - Send messages and files for processing',
        '/api/health': 'GET - Health check'
    }
    return jsonify({
        'service': 'Ashabot API',
        'version': '1.0.0',
        'model': HF_MODEL,
        'endpoints': endpoint_docs,
        'documentation': 'See README.md for full API documentation'
    })
if __name__ == '__main__':
    # Honor a platform-assigned PORT (e.g. on a hosting service); default 5000.
    listen_port = int(os.environ.get('PORT', 5000))
    # debug=False for production
    app.run(host='0.0.0.0', port=listen_port, debug=False)