# Author: PedroM2626 — commit 8f65225
# refactor: translate Portuguese codebase to English for internationalization
import gradio as gr
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from docx import Document
from PyPDF2 import PdfReader
import os
from dotenv import load_dotenv
import json
import re
import unicodedata
import requests
def normalize_text(text):
    """Return *text* lowercased, accent-stripped, and cleared of punctuation.

    Falsy input (None, empty string) yields "". Only ASCII letters,
    digits, and whitespace survive, which makes the result suitable for
    simple substring / word matching.
    """
    if not text:
        return ""
    lowered = text.lower().strip()
    # NFD decomposition splits accented characters into base + combining
    # mark; dropping category 'Mn' (nonspacing marks) removes the accents.
    decomposed = unicodedata.normalize('NFD', lowered)
    without_accents = "".join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Strip everything except lowercase letters, digits, and whitespace.
    return re.sub(r'[^a-z0-9\s]', '', without_accents)
# Load environment variables from a local .env file (no-op if absent)
load_dotenv()
# Initialize Natural Language Understanding.
# Credentials come from the environment; the literal placeholder defaults
# will fail IAM authentication if no real values are configured.
API_KEY = os.getenv('IBM_WATSON_API_KEY', 'YOUR_API_KEY')
SERVICE_URL = os.getenv('IBM_WATSON_URL', 'YOUR_SERVICE_URL')
# Watsonx project id used by the RAG chat endpoint (see smart_chat).
PROJECT_ID = os.getenv('IBM_WATSONX_PROJECT_ID', 'YOUR_PROJECT_ID')
WATSONX_API_KEY = os.getenv('IBM_WATSONX_API_KEY', API_KEY)  # Use specific key or general as fallback
# Module-level NLU client shared by process_text and answer_question.
authenticator = IAMAuthenticator(API_KEY)
nlu = NaturalLanguageUnderstandingV1(
    version='2024-05-10',
    authenticator=authenticator
)
nlu.set_service_url(SERVICE_URL)
# Function to extract text from a document
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    Accepts either a gr.File object (its temporary path is read from the
    .name attribute) or a plain path string. Returns the extracted text,
    or an English error message when the file is missing, its format is
    unsupported, or extraction fails.
    """
    if not file:
        return "No file uploaded."
    try:
        # gr.File objects expose the temporary path via .name
        file_name = file.name if hasattr(file, 'name') else file
        # Case-insensitive extension check so ".PDF"/".Docx"/".TXT"
        # uploads are accepted too (the original check was case-sensitive).
        lower_name = file_name.lower()
        if lower_name.endswith('.pdf'):
            reader = PdfReader(file_name)
            # extract_text() may return None for image-only pages; skip those.
            return ''.join(page.extract_text() or '' for page in reader.pages)
        elif lower_name.endswith('.docx'):
            doc = Document(file_name)
            return ''.join(para.text + '\n' for para in doc.paragraphs)
        elif lower_name.endswith('.txt'):
            # errors='replace' keeps partial content readable instead of
            # failing outright on the occasional non-UTF-8 byte.
            with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
                return f.read()
        else:
            return "Unsupported file format. Use PDF, DOCX or TXT."
    except Exception as e:
        return f"Error extracting text: {str(e)}"
# Function to process text (Summary, Keywords, Classification)
def process_text(text):
    """Summarize, extract key topics, and classify *text* via Watson NLU.

    Returns a (summary, topics, classification) tuple of strings. On any
    unrecoverable NLU failure the first element carries the error message
    and the other two are empty.
    """
    if not text or len(text.strip()) < 10:
        return "Insufficient text for processing.", "", ""
    try:
        # Summarization is not offered on every NLU plan/region, so this
        # call gets its own guard with a plan-specific fallback message.
        try:
            summary_result = nlu.analyze(
                text=text,
                features={'summarization': {'limit': 1}}
            ).get_result()
            summary = summary_result.get('summarization', {}).get('text', 'Summary not available.')
        except Exception:
            summary = "Automatic summarization not available in your Watson NLU plan. Showing main concepts..."
        # Keyword extraction for the "Key Topics" output.
        keyword_result = nlu.analyze(
            text=text,
            features={'keywords': {'limit': 10}}
        ).get_result()
        keyword_texts = [entry['text'] for entry in keyword_result.get('keywords', [])]
        topics = ", ".join(keyword_texts[:5])
        # If summarization was unavailable, synthesize a short description
        # from the top keywords instead.
        if "not available" in summary:
            summary = f"The document covers topics such as: {', '.join(keyword_texts[:3])}."
        # Thematic classification (taxonomy categories).
        category_result = nlu.analyze(
            text=text,
            features={'categories': {'limit': 5}}
        ).get_result()
        labels = [entry['label'] for entry in category_result.get('categories', [])]
        return summary, topics, ", ".join(labels)
    except Exception as e:
        return f"Processing error: {str(e)}", "", ""
# Function to answer questions about the document (Search)
def _extract_search_terms(question):
    """Return a list of normalized search terms for *question*.

    Tries Watson NLU keyword/concept extraction first; falls back to a
    plain normalized word split, and finally to the whole normalized
    question, so the list is never empty for non-empty input.
    """
    terms = []
    try:
        analysis = nlu.analyze(
            text=question,
            features={'keywords': {}, 'concepts': {}}
        ).get_result()
        terms = [normalize_text(k['text']) for k in analysis.get('keywords', [])]
        terms += [normalize_text(c['text']) for c in analysis.get('concepts', [])]
    except Exception:
        # NLU often rejects very short questions; the manual fallback
        # below covers that. (Was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.)
        pass
    if not terms:
        terms = normalize_text(question).split()
    if not terms:
        # Last resort: use the entire normalized question as one term.
        terms = [normalize_text(question)]
    return terms


def _candidate_blocks(text):
    """Split *text* into searchable blocks: paragraphs, then lines, then sentences.

    Each block is returned as {'original': str, 'normalized': str}; blocks
    of 20 characters or fewer are discarded as noise.
    """
    raw_blocks = re.split(r'\n\s*\n', text)
    if len(raw_blocks) < 2:
        raw_blocks = text.split('\n')
    candidates = [
        {'original': clean, 'normalized': normalize_text(clean)}
        for clean in (block.strip() for block in raw_blocks)
        if len(clean) > 20
    ]
    # If paragraph/line splitting produced too few blocks, fall back to
    # sentence-level splitting for finer-grained matches.
    if len(candidates) < 3:
        candidates = [
            {'original': clean, 'normalized': normalize_text(clean)}
            for clean in (s.strip() for s in re.split(r'\.\s+', text))
            if len(clean) > 20
        ]
    return candidates


def _rank_blocks(candidates, search_terms):
    """Return (best_block_text, score) for the highest-scoring candidate.

    Each term scores +1 for a substring hit and +2 more for a whole-word
    hit; ties prefer the shorter (more specific) block.
    """
    best_paragraph = ""
    highest_score = 0
    for item in candidates:
        p_norm = item['normalized']
        score = 0
        for term in search_terms:
            if not term:
                continue
            if term in p_norm:
                score += 1
            # Whole-word bonus to avoid false positives inside substrings.
            if re.search(rf'\b{re.escape(term)}\b', p_norm):
                score += 2
        if score > highest_score:
            highest_score = score
            best_paragraph = item['original']
        elif score == highest_score and score > 0:
            if len(item['original']) < len(best_paragraph):
                best_paragraph = item['original']
    return best_paragraph, highest_score


def answer_question(question, text):
    """Locate the snippet of *text* most relevant to *question* (lexical search).

    Extracts search terms from the question, scores each document block by
    term hits, and returns the best snippet wrapped in an English sentence,
    a not-found message, or an error message if processing fails.
    """
    if not question or not text:
        return "Please provide a question and ensure the document has been analyzed first."
    try:
        search_terms = _extract_search_terms(question)
        candidates = _candidate_blocks(text)
        best_paragraph, highest_score = _rank_blocks(candidates, search_terms)
        if best_paragraph and highest_score > 0:
            return f"Based on the document, I found this relevant snippet:\n\n\"{best_paragraph}\""
        return "Unfortunately I didn't find a direct answer in the document. Try rephrasing your question with other terms."
    except Exception as e:
        return f"Error processing smart search: {str(e)}"
# --- Smart Chat Functions (RAG with Watsonx AI) ---
def get_iam_token():
    """Exchange the Watsonx API key for an IBM Cloud IAM access token.

    Returns the bearer token string on success. On failure returns a
    human-readable message that always starts with "Error" so callers
    (smart_chat) can detect it with token.startswith("Error") — the
    original 400 and connection messages did not start with "Error" and
    were silently passed along as if they were tokens.
    """
    url = "https://iam.cloud.ibm.com/identity/token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={WATSONX_API_KEY}"
    try:
        # Timeout so a hung IAM endpoint cannot block the UI indefinitely.
        response = requests.post(url, headers=headers, data=data, timeout=30)
        if response.status_code == 200:
            return response.json().get("access_token")
        if response.status_code == 400:
            # HTTP 400 from IAM means the API key itself was rejected.
            return "Error: Authentication failed (400). The provided API Key is invalid or not found. Check your .env file."
        return f"Error generating token ({response.status_code}): {response.text}"
    except Exception as e:
        return f"Error: connection failure generating token: {str(e)}"
def smart_chat(question, document_text):
    """Answer *question* grounded in *document_text* via Llama-3 on Watsonx AI (RAG).

    Returns the model's Markdown answer, or an English error message on
    missing input, authentication failure, or API/connection errors.
    """
    if not question or not document_text:
        return "Please analyze a document first and type a question."
    token = get_iam_token()
    # get_iam_token signals failure with a human-readable string. The old
    # startswith("Error") check missed the "Authentication Error..." and
    # "Connection error..." variants and forwarded them as bearer tokens;
    # this case-insensitive prefix check covers all variants, and the
    # `not token` guard covers a response with no access_token field.
    if not token or token.lower().startswith(("error", "authentication error", "connection error")):
        return token or "Error: IAM token request returned no access token."
    url = "https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29"
    # Truncate the document so the prompt stays under the model token limit.
    context = document_text[:10000]  # Approximately 2500 tokens
    body = {
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a helpful and honest AI assistant. "
                    "Your task is to answer questions based EXCLUSIVELY on the content of the document provided below. "
                    "If the answer is not in the text, say you didn't find the information in the document. "
                    "Always answer in English and use Markdown formatting.\n\n"
                    f"DOCUMENT CONTENT:\n{context}"
                )
            },
            {
                "role": "user",
                "content": question
            }
        ],
        "project_id": PROJECT_ID,
        "model_id": "meta-llama/llama-3-3-70b-instruct",
        "frequency_penalty": 0,
        "max_tokens": 2000,
        "presence_penalty": 0,
        "temperature": 0,
        "top_p": 1
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}"
    }
    try:
        # Timeout so a stalled inference request cannot hang the UI thread.
        response = requests.post(url, headers=headers, json=body, timeout=120)
        if response.status_code != 200:
            return f"Watsonx API Error: {response.text}"
        data = response.json()
        return data['choices'][0]['message']['content']
    except Exception as e:
        return f"Chat processing error: {str(e)}"
# --- Gradio Interface using Blocks ---
def create_interface():
    """Assemble the three-tab Gradio Blocks UI and wire its event handlers."""
    with gr.Blocks(title="Intelligent Document Analysis") as demo:
        gr.Markdown("# 📑 Watsonx AI - Intelligent Document Analysis")
        gr.Markdown("Extract information, summaries and ask questions about your PDF, DOCX or TXT documents.")

        # Tab 1: upload a document, extract its text, run the NLU analysis.
        with gr.Tab("1. Extraction and Analysis"):
            with gr.Row():
                with gr.Column():
                    upload_box = gr.File(label="Document Upload")
                    run_analysis_btn = gr.Button("Analyze Document", variant="primary")
                with gr.Column():
                    text_display = gr.Textbox(label="Extracted Text", lines=10, interactive=False)
            with gr.Row():
                summary_box = gr.Textbox(label="Automatic Summary")
                topics_box = gr.Textbox(label="Key Topics")
                category_box = gr.Textbox(label="Thematic Classification")

        # Tab 2: lexical snippet search over the extracted text.
        with gr.Tab("2. Snippet Locator (Semantic Search)"):
            gr.Markdown("### 🔍 Find specific snippets in the document")
            gr.Markdown("This tool locates the most relevant paragraphs containing your search terms.")
            with gr.Row():
                search_box = gr.Textbox(label="What are you looking for in the text?", placeholder="Ex: Revenue goals")
                search_btn = gr.Button("Locate Snippet", variant="secondary")
            snippet_box = gr.Textbox(label="Most relevant snippet found", lines=10)

        # Tab 3: RAG chat backed by Llama-3 on Watsonx AI.
        with gr.Tab("3. Smart Chat (RAG)"):
            gr.Markdown("### 🤖 Ask the Artificial Intelligence")
            gr.Markdown("The Llama-3 model will analyze the entire document to answer your questions with reasoning and synthesis.")
            with gr.Row():
                chat_question_box = gr.Textbox(label="Your Question for IA", placeholder="Ex: What is the main theme of the document?")
                chat_btn = gr.Button("Generate IA Response", variant="primary")
            chat_answer = gr.Markdown()

        # Chained pipeline for tab 1: extraction feeds the NLU analysis.
        def _analyze_document(file):
            extracted = extract_text(file)
            summary, topics, classification = process_text(extracted)
            return extracted, summary, topics, classification

        run_analysis_btn.click(
            fn=_analyze_document,
            inputs=[upload_box],
            outputs=[text_display, summary_box, topics_box, category_box]
        )
        search_btn.click(
            fn=answer_question,
            inputs=[search_box, text_display],
            outputs=[snippet_box]
        )
        chat_btn.click(
            fn=smart_chat,
            inputs=[chat_question_box, text_display],
            outputs=[chat_answer]
        )
    return demo
# Launch the Gradio app only when run as a script (not on import).
if __name__ == "__main__":
    create_interface().launch()