Spaces:

fccoelho
/

reference_extractor

Sleeping

App Files Files Community

fccoelho aider (anthropic/claude-sonnet-4-20250514) committed on Sep 12, 2025

Commit

64597df

1 Parent(s): 32d9fa7

feat: adicionar extração de referências por regex e segunda tabela

Browse files

Co-authored-by: aider (anthropic/claude-sonnet-4-20250514) <aider@aider.chat>

Files changed (1) hide show

app.py +119 -13

app.py CHANGED Viewed

@@ -116,36 +116,131 @@ def extract_references_with_llm(text, model_name):
     except Exception as e:
         return [{"error": f"Erro ao processar com LLM ({model_name}): {str(e)}"}]
 def process_pdf(pdf_file, model_name):
     """Função principal que processa o PDF e retorna resultados"""
     if pdf_file is None:
-        return {"error": "Nenhum arquivo enviado"}, pd.DataFrame()
     # Extrair texto do PDF
     text, metadata = extract_pdf_text(pdf_file)
     if text is None:
-        return metadata, pd.DataFrame()
     # Adicionar modelo selecionado aos metadados
     metadata["modelo_usado"] = model_name
     # Extrair referências com LLM
-    references = extract_references_with_llm(text, model_name)
-    # Converter para DataFrame
-    if references and not any("error" in ref for ref in references):
-        df = pd.DataFrame(references)
     else:
-        df = pd.DataFrame({"Erro": ["Não foi possível extrair referências"]})
-    return metadata, df
 def create_interface():
     """Cria a interface Gradio"""
     with gr.Blocks(title="Extrator de Referências") as interface:
         gr.Markdown("# 📚 Extrator de Referências de Artigos Científicos")
-        gr.Markdown("Faça upload de um PDF de artigo científico para extrair automaticamente a lista de referências.")
         with gr.Row():
             with gr.Column():
@@ -171,22 +266,33 @@ def create_interface():
         extract_btn = gr.Button("🔍 Extrair Referências", variant="primary")
         with gr.Row():
             with gr.Column():
-                metadata_output = gr.JSON(label="📋 Metadados do Artigo")
             with gr.Column():
-                references_output = gr.Dataframe(
-                    label="📖 Lista de Referências",
                     row_count=(10,'dynamic'),
                     show_copy_button=True,
                     show_fullscreen_button=True,
                     wrap=True
                 )
         extract_btn.click(
             process_pdf,
             inputs=[pdf_input, model_dropdown],
-            outputs=[metadata_output, references_output]
         )
     return interface

     except Exception as e:
         return [{"error": f"Erro ao processar com LLM ({model_name}): {str(e)}"}]
# Crossref-style DOI: "10.", a 4-9 digit registrant code, "/", then a suffix
# that runs up to the next whitespace.
_DOI_RE = re.compile(r'\b10\.\d{4,9}/\S+')
# A DOI is attributed to a reference only if it starts within this many
# characters after the end of the matched reference.
_DOI_LOOKAHEAD = 200
# Extra scan window so a DOI that starts just inside the limit is not cut off.
_DOI_MAX_LENGTH = 256


def extract_references_with_regex(text):
    """Extract bibliographic references from article text with regular expressions.

    The text after the first recognised references heading is scanned (or the
    last 30% of the text when no heading is found) with three citation
    patterns of the form ``Authors. (Year). Title. Journal.``.

    Args:
        text: Full plain text of the article.

    Returns:
        A list of at most 50 dicts with the keys ``authors``, ``title``,
        ``journal``, ``year``, ``volume``, ``pages`` and ``doi`` (missing
        values are empty strings), de-duplicated by title. On any failure a
        single-element list ``[{"error": message}]`` is returned instead of
        raising.
    """
    try:
        references_section = ""
        # Headings that mark the start of the reference list. The last entry
        # also accepts a bare "Referências", not only "Referências
        # Bibliográficas".
        ref_patterns = [
            r'(?i)references?\s*\n',
            r'(?i)bibliography\s*\n',
            r'(?i)literatura\s+citada\s*\n',
            r'(?i)referências(?:\s+bibliográficas)?\s*\n',
        ]
        for pattern in ref_patterns:
            heading = re.search(pattern, text)
            if heading:
                references_section = text[heading.end():]
                break
        if not references_section:
            # No heading found: fall back to the last 30% of the text.
            references_section = text[int(len(text) * 0.7):]

        # Basic pattern: Author(s). (Year). Title. Journal/Publisher.
        ref_pattern = r'([A-Z][^.]*?)\.\s*\((\d{4})\)\.\s*([^.]+)\.\s*([^.]+?)(?:\.|$)'
        # Same layout preceded by a "[n]" marker.
        numbered_pattern = r'\[\d+\]\s*([A-Z][^.]*?)\.\s*\((\d{4})\)\.\s*([^.]+)\.\s*([^.]+?)(?:\.|$)'
        # Variant with no period before the year and "," or "." separators.
        alt_pattern = r'([A-Z][A-Za-z\s,&]+)\s+\((\d{4})\)[.,]\s*([^.]+)[.,]\s*([^.]+?)(?:\.|$)'

        references = []
        section_length = len(references_section)
        for pattern in (ref_pattern, numbered_pattern, alt_pattern):
            matches = list(
                re.finditer(pattern, references_section, re.MULTILINE | re.DOTALL)
            )
            for index, match in enumerate(matches):
                authors, year, title, journal = (
                    group.strip() for group in match.groups()
                )

                # The journal capture cannot contain a period, so a DOI
                # ("10.xxxx/...") is never inside it. Look in the text that
                # follows instead, up to the next matched reference.
                if index + 1 < len(matches):
                    next_start = matches[index + 1].start()
                else:
                    next_start = section_length
                doi_limit = min(match.end() + _DOI_LOOKAHEAD, next_start)
                doi_match = _DOI_RE.search(
                    references_section,
                    match.start(4),
                    min(section_length, doi_limit + _DOI_MAX_LENGTH),
                )
                doi = ""
                if doi_match and doi_match.start() < doi_limit:
                    # Drop sentence punctuation glued to the end of the DOI.
                    doi = doi_match.group(0).rstrip('.,;)')

                # Volume and page range, e.g. "12(3), 45-67" or "5: 100-110".
                vol_pages_match = re.search(
                    r'(\d+)\s*\(?\d*\)?\s*[,:]\s*(\d+[-–]\d+)', journal
                )
                volume = vol_pages_match.group(1) if vol_pages_match else ""
                pages = vol_pages_match.group(2) if vol_pages_match else ""

                references.append({
                    "authors": authors,
                    "title": title,
                    "journal": journal,
                    "year": year,
                    "volume": volume,
                    "pages": pages,
                    "doi": doi
                })

        # The patterns overlap, so keep only the first hit per title. Titles
        # of 10 characters or fewer are discarded.
        seen_titles = set()
        unique_references = []
        for ref in references:
            title_key = ref["title"].lower().strip()
            if title_key not in seen_titles and len(title_key) > 10:
                seen_titles.add(title_key)
                unique_references.append(ref)
        return unique_references[:50]  # Cap at 50 references to limit noise.
    except Exception as e:
        # Failures are reported as an {"error": ...} entry rather than raised.
        return [{"error": f"Erro na extração por regex: {str(e)}"}]
 def process_pdf(pdf_file, model_name):
     """Função principal que processa o PDF e retorna resultados"""
     if pdf_file is None:
+        return {"error": "Nenhum arquivo enviado"}, pd.DataFrame(), pd.DataFrame(), "❌ Nenhum arquivo enviado"
     # Extrair texto do PDF
     text, metadata = extract_pdf_text(pdf_file)
     if text is None:
+        return metadata, pd.DataFrame(), pd.DataFrame(), "❌ Erro ao processar PDF"
     # Adicionar modelo selecionado aos metadados
     metadata["modelo_usado"] = model_name
     # Extrair referências com LLM
+    llm_references = extract_references_with_llm(text, model_name)
+    # Extrair referências com Regex
+    regex_references = extract_references_with_regex(text)
+    # Converter para DataFrames
+    if llm_references and not any("error" in ref for ref in llm_references):
+        llm_df = pd.DataFrame(llm_references)
+    else:
+        llm_df = pd.DataFrame({"Erro": ["Não foi possível extrair referências com LLM"]})
+    if regex_references and not any("error" in ref for ref in regex_references):
+        regex_df = pd.DataFrame(regex_references)
     else:
+        regex_df = pd.DataFrame({"Erro": ["Não foi possível extrair referências com Regex"]})
+    # Criar status
+    llm_count = len(llm_references) if llm_references and not any("error" in ref for ref in llm_references) else 0
+    regex_count = len(regex_references) if regex_references and not any("error" in ref for ref in regex_references) else 0
+    status = f"📊 **Resultados da Extração:**\n- LLM ({model_name}): {llm_count} referências\n- Regex: {regex_count} referências"
+    return metadata, llm_df, regex_df, status
 def create_interface():
     """Cria a interface Gradio"""
     with gr.Blocks(title="Extrator de Referências") as interface:
         gr.Markdown("# 📚 Extrator de Referências de Artigos Científicos")
+        gr.Markdown("Faça upload de um PDF de artigo científico para extrair automaticamente a lista de referências usando IA e expressões regulares.")
         with gr.Row():
             with gr.Column():
         extract_btn = gr.Button("🔍 Extrair Referências", variant="primary")
+        with gr.Row():
+            metadata_output = gr.JSON(label="📋 Metadados do Artigo")
         with gr.Row():
             with gr.Column():
+                llm_references_output = gr.Dataframe(
+                    label="🤖 Referências Extraídas por IA",
+                    row_count=(10,'dynamic'),
+                    show_copy_button=True,
+                    show_fullscreen_button=True,
+                    wrap=True
+                )
             with gr.Column():
+                regex_references_output = gr.Dataframe(
+                    label="🔍 Referências Extraídas por Regex",
                     row_count=(10,'dynamic'),
                     show_copy_button=True,
                     show_fullscreen_button=True,
                     wrap=True
                 )
+        status_output = gr.Markdown(label="📊 Status da Extração")
         extract_btn.click(
             process_pdf,
             inputs=[pdf_input, model_dropdown],
+            outputs=[metadata_output, llm_references_output, regex_references_output, status_output]
         )
     return interface