Commit 8f65225
Parent(s): c1b16e4

refactor: translate Portuguese codebase to English for internationalization

app.py CHANGED
[The removed lines (the original Portuguese source) were truncated in the page extraction; each hunk below shows its surviving header and the added English lines.]
@@ -10,26 +10,26 @@ import re
 import unicodedata
 import requests
 
+def normalize_text(text):
+    """Removes accents and special characters, and converts to lowercase."""
+    if not text:
+        return ""
+    # Convert to lowercase and trim surrounding whitespace
+    text = text.lower().strip()
+    # Remove accents
+    text = "".join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
+    # Remove basic punctuation for search (keep letters and numbers)
+    text = re.sub(r'[^a-z0-9\s]', '', text)
+    return text
 
+# Load environment variables
 load_dotenv()
 
+# Initialize Natural Language Understanding
+API_KEY = os.getenv('IBM_WATSON_API_KEY', 'YOUR_API_KEY')
+SERVICE_URL = os.getenv('IBM_WATSON_URL', 'YOUR_SERVICE_URL')
+PROJECT_ID = os.getenv('IBM_WATSONX_PROJECT_ID', 'YOUR_PROJECT_ID')
+WATSONX_API_KEY = os.getenv('IBM_WATSONX_API_KEY', API_KEY)  # Use the specific key, or the general one as a fallback
 
 authenticator = IAMAuthenticator(API_KEY)
 nlu = NaturalLanguageUnderstandingV1(
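As a quick reference, here is what normalize_text produces on accented input; the sample strings are invented, not from the commit:

    >>> normalize_text("Olá, Análise de Documentos!")
    'ola analise de documentos'
    >>> normalize_text("  Receita: R$ 1.500  ")
    'receita r 1500'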
@@ -38,173 +38,173 @@ nlu = NaturalLanguageUnderstandingV1(
 )
 nlu.set_service_url(SERVICE_URL)
 
+# Function to extract text from a document
+def extract_text(file):
+    if not file:
+        return "No file uploaded."
 
     try:
+        # If file is a gr.File object, it exposes a .name attribute (the temporary path)
+        file_name = file.name if hasattr(file, 'name') else file
 
+        if file_name.endswith('.pdf'):
+            reader = PdfReader(file_name)
+            text = ''
             for page in reader.pages:
                 page_text = page.extract_text()
                 if page_text:
+                    text += page_text
+            return text
+        elif file_name.endswith('.docx'):
+            doc = Document(file_name)
+            text = ''
             for para in doc.paragraphs:
+                text += para.text + '\n'
+            return text
+        elif file_name.endswith('.txt'):
+            with open(file_name, 'r', encoding='utf-8') as f:
                 return f.read()
         else:
+            return "Unsupported file format. Use PDF, DOCX or TXT."
     except Exception as e:
+        return f"Error extracting text: {str(e)}"
 
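Because extract_text falls back to treating its argument as a plain path when there is no .name attribute, it can also be exercised outside Gradio; the file names here are hypothetical:

    print(extract_text("report.pdf")[:200])   # first 200 characters of a local PDF
    print(extract_text("notes.txt"))          # plain text is read as UTF-8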
+
# Function to process text (Summary, Keywords, Classification)
|
| 73 |
+
def process_text(text):
|
| 74 |
+
if not text or len(text.strip()) < 10:
|
| 75 |
+
return "Insufficient text for processing.", "", ""
|
| 76 |
|
| 77 |
try:
|
| 78 |
+
# Try automatic summarization (may not be available in all plans/regions)
|
| 79 |
try:
|
| 80 |
+
summary_res = nlu.analyze(
|
| 81 |
+
text=text,
|
| 82 |
features={'summarization': {'limit': 1}}
|
| 83 |
).get_result()
|
| 84 |
+
summary = summary_res.get('summarization', {}).get('text', 'Summary not available.')
|
| 85 |
except Exception:
|
| 86 |
+
summary = "Automatic summarization not available in your Watson NLU plan. Showing main concepts..."
|
| 87 |
|
| 88 |
+
# Key topics extraction (keywords)
|
| 89 |
+
topics_res = nlu.analyze(
|
| 90 |
+
text=text,
|
| 91 |
features={'keywords': {'limit': 10}}
|
| 92 |
).get_result()
|
| 93 |
+
topics_list = [k['text'] for k in topics_res.get('keywords', [])]
|
| 94 |
+
topics = ", ".join(topics_list[:5])
|
| 95 |
|
| 96 |
+
# If summary failed, we try to use topics to create a simple description
|
| 97 |
+
if "not available" in summary:
|
| 98 |
+
summary = f"The document covers topics such as: {', '.join(topics_list[:3])}."
|
| 99 |
|
| 100 |
+
# Thematic classification (categories)
|
| 101 |
+
classification_res = nlu.analyze(
|
| 102 |
+
text=text,
|
| 103 |
features={'categories': {'limit': 5}}
|
| 104 |
).get_result()
|
| 105 |
+
classification = ", ".join([c['label'] for c in classification_res.get('categories', [])])
|
| 106 |
|
| 107 |
+
return summary, topics, classification
|
| 108 |
except Exception as e:
|
| 109 |
+
return f"Processing error: {str(e)}", "", ""
|
| 110 |
|
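For orientation, the dictionary lookups above assume result payloads of roughly this shape (field names follow Watson NLU v1; the values are invented):

    example = {
        'keywords': [{'text': 'revenue goals', 'relevance': 0.98}],
        'categories': [{'label': '/business and industrial', 'score': 0.91}],
    }
    topics = ", ".join(k['text'] for k in example.get('keywords', []))      # 'revenue goals'
    labels = ", ".join(c['label'] for c in example.get('categories', []))   # '/business and industrial'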
+# Function to answer questions about the document (search)
+def answer_question(question, text):
+    if not question or not text:
+        return "Please provide a question and ensure the document has been analyzed first."
 
     try:
+        # 1. Extract important terms from the question with NLU (keywords and concepts)
+        search_terms = []
         try:
+            question_analysis = nlu.analyze(
+                text=question,
                 features={'keywords': {}, 'concepts': {}}
             ).get_result()
 
+            for k in question_analysis.get('keywords', []):
+                search_terms.append(normalize_text(k['text']))
+            for c in question_analysis.get('concepts', []):
+                search_terms.append(normalize_text(c['text']))
         except:
+            pass  # Fall back to manual extraction if NLU fails on a short question
 
+        # If Watson returns no terms or fails, split the normalized question manually
+        if not search_terms:
+            search_terms = normalize_text(question).split()
 
+        if not search_terms:
+            # Last resort: use the entire normalized question
+            search_terms = [normalize_text(question)]
 
+        # 2. Process the document text
+        # Normalize the full text for searching
+        normalized_text = normalize_text(text)
 
+        # Split the document into smaller blocks (paragraphs)
+        raw_blocks = re.split(r'\n\s*\n', text)
+        if len(raw_blocks) < 2:
+            raw_blocks = text.split('\n')
 
+        valid_paragraphs = []
+        for block in raw_blocks:
+            clean = block.strip()
+            if len(clean) > 20:  # Keep only blocks with some minimum content
+                valid_paragraphs.append({
+                    'original': clean,
+                    'normalized': normalize_text(clean)
                 })
 
+        # If there are still too few blocks, try splitting by sentences
+        if len(valid_paragraphs) < 3:
+            sentences = re.split(r'\.\s+', text)
+            valid_paragraphs = []
+            for s in sentences:
+                clean = s.strip()
+                if len(clean) > 20:
+                    valid_paragraphs.append({
+                        'original': clean,
+                        'normalized': normalize_text(clean)
                     })
 
+        # 3. Relevance scoring (ranking)
+        best_paragraph = ""
+        highest_score = 0
 
+        for item in valid_paragraphs:
+            p_norm = item['normalized']
             score = 0
 
+            for term in search_terms:
+                if not term: continue
+                # Exact (normalized) substring match in the paragraph
+                if term in p_norm:
                     score += 1
+                # Whole-word bonus to avoid false positives on substrings
+                if re.search(rf'\b{re.escape(term)}\b', p_norm):
                     score += 2
 
+            # Keep the best paragraph; on equal scores, prefer the shorter (more specific) one
+            if score > highest_score:
+                highest_score = score
+                best_paragraph = item['original']
+            elif score == highest_score and score > 0:
+                if len(item['original']) < len(best_paragraph):
+                    best_paragraph = item['original']
 
+        # 4. Return the result
+        if best_paragraph and highest_score > 0:
+            return f"Based on the document, I found this relevant snippet:\n\n\"{best_paragraph}\""
         else:
+            return "Unfortunately I didn't find a direct answer in the document. Try rephrasing your question with different terms."
 
     except Exception as e:
+        return f"Error processing smart search: {str(e)}"
 
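A self-contained illustration of the step-3 scoring rule, with an invented paragraph and terms; a term found as a whole word earns both the substring point and the whole-word bonus:

    import re

    def score_paragraph(p_norm, terms):
        score = 0
        for term in terms:
            if term in p_norm:                                # substring match
                score += 1
            if re.search(rf'\b{re.escape(term)}\b', p_norm):  # whole-word bonus
                score += 2
        return score

    print(score_paragraph("our revenue goals for 2025 are listed below", ["revenue", "goals"]))  # 6
    print(score_paragraph("hiring plans follow", ["revenue", "goals"]))                          # 0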
+
# --- Smart Chat Functions (RAG with Watsonx AI) ---
|
| 205 |
|
| 206 |
+
def get_iam_token():
|
| 207 |
+
"""Generates an IAM access token using the Watsonx API Key."""
|
| 208 |
url = "https://iam.cloud.ibm.com/identity/token"
|
| 209 |
headers = {"Content-Type": "application/x-www-form-urlencoded"}
|
| 210 |
data = f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={WATSONX_API_KEY}"
|
@@ -214,41 +214,41 @@ def obter_iam_token():
         if response.status_code == 200:
             return response.json().get("access_token")
         elif response.status_code == 400:
+            return "Error authenticating (400): the provided API key is invalid or was not found. Check your .env file."
         else:
+            return f"Error generating token ({response.status_code}): {response.text}"
     except Exception as e:
+        return f"Connection error generating token: {str(e)}"
 
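The requests.post call that produces response sits in unchanged lines the diff does not display. For reference, the same exchange can be reproduced standalone against the IAM endpoint used above; this sketch is an assumption, not part of the commit:

    import requests

    def fetch_token(api_key):
        resp = requests.post(
            "https://iam.cloud.ibm.com/identity/token",
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
        )
        resp.raise_for_status()  # raises on 4xx/5xx instead of returning error strings
        return resp.json()["access_token"]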
+def smart_chat(question, document_text):
+    """Performs a smart chat (RAG) using the Llama-3 model on Watsonx AI."""
+    if not question or not document_text:
+        return "Please analyze a document first and type a question."
 
+    token = get_iam_token()
+    if token.startswith("Error"):
         return token
 
     url = "https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29"
 
+    # Limit the document text so it does not exceed the model's token limit
+    context = document_text[:10000]  # Approximately 2500 tokens
 
     body = {
         "messages": [
             {
                 "role": "system",
                 "content": (
+                    "You are a helpful and honest AI assistant. "
+                    "Your task is to answer questions based EXCLUSIVELY on the content of the document provided below. "
+                    "If the answer is not in the text, say you didn't find the information in the document. "
+                    "Always answer in English and use Markdown formatting.\n\n"
+                    f"DOCUMENT CONTENT:\n{context}"
                 )
             },
             {
                 "role": "user",
+                "content": question
             }
         ],
         "project_id": PROJECT_ID,
@@ -269,77 +269,77 @@ def chat_inteligente(pergunta, texto_documento):
     try:
         response = requests.post(url, headers=headers, json=body)
         if response.status_code != 200:
+            return f"Watsonx API Error: {response.text}"
 
         data = response.json()
         return data['choices'][0]['message']['content']
     except Exception as e:
+        return f"Chat processing error: {str(e)}"
 
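The indexing on the success path assumes a chat response of roughly this shape (an OpenAI-style choices schema; the content value is invented):

    data = {
        "choices": [
            {"message": {"role": "assistant", "content": "The document mainly discusses..."}}
        ]
    }
    answer = data['choices'][0]['message']['content']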
+# --- Gradio Interface using Blocks ---
+def create_interface():
+    with gr.Blocks(title="Intelligent Document Analysis") as demo:
+        gr.Markdown("# 📑 Watsonx AI - Intelligent Document Analysis")
+        gr.Markdown("Extract information and summaries from your PDF, DOCX or TXT documents, and ask questions about them.")
 
+        with gr.Tab("1. Extraction and Analysis"):
             with gr.Row():
                 with gr.Column():
+                    file_input = gr.File(label="Document Upload")
+                    analyze_button = gr.Button("Analyze Document", variant="primary")
 
                 with gr.Column():
+                    extracted_text = gr.Textbox(label="Extracted Text", lines=10, interactive=False)
 
             with gr.Row():
+                summary_output = gr.Textbox(label="Automatic Summary")
+                topics_output = gr.Textbox(label="Key Topics")
+                classification_output = gr.Textbox(label="Thematic Classification")
 
+        with gr.Tab("2. Snippet Locator (Semantic Search)"):
+            gr.Markdown("### 🔍 Find specific snippets in the document")
+            gr.Markdown("This tool locates the paragraphs most relevant to your search terms.")
             with gr.Row():
+                question_input = gr.Textbox(label="What are you looking for in the text?", placeholder="e.g., Revenue goals")
+                question_button = gr.Button("Locate Snippet", variant="secondary")
 
+            answer_output = gr.Textbox(label="Most relevant snippet found", lines=10)
 
+        with gr.Tab("3. Smart Chat (RAG)"):
+            gr.Markdown("### 🤖 Ask the Artificial Intelligence")
+            gr.Markdown("The Llama-3 model will analyze the entire document to answer your questions with reasoning and synthesis.")
             with gr.Row():
+                chat_input = gr.Textbox(label="Your Question for the AI", placeholder="e.g., What is the main theme of the document?")
+                chat_button = gr.Button("Generate AI Response", variant="primary")
 
             chat_output = gr.Markdown()
 
+        # Event definitions
+        def run_analysis_flow(file):
+            text = extract_text(file)
+            summary, topics, classification = process_text(text)
+            return text, summary, topics, classification
 
+        analyze_button.click(
+            fn=run_analysis_flow,
+            inputs=[file_input],
+            outputs=[extracted_text, summary_output, topics_output, classification_output]
         )
 
+        question_button.click(
+            fn=answer_question,
+            inputs=[question_input, extracted_text],
+            outputs=[answer_output]
         )
 
+        chat_button.click(
+            fn=smart_chat,
+            inputs=[chat_input, extracted_text],
             outputs=[chat_output]
         )
 
     return demo
 
 if __name__ == "__main__":
+    app = create_interface()
     app.launch()
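Running "python app.py" serves the Gradio UI, by default on http://127.0.0.1:7860. If the app needs to be reachable from other machines, launch() accepts the standard Gradio options; this variant is illustrative, not part of the commit:

    app.launch(server_name="0.0.0.0", server_port=7860)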