Tanaybh commited on
Commit
3a5fdfb
·
verified ·
1 Parent(s): 422c3d1

Upload 4 files

Browse files
Files changed (4) hide show
  1. agents.py +351 -0
  2. fetch_arxiv_data.py +114 -0
  3. retriever.py +201 -0
  4. utils.py +231 -0
agents.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DocMind - Multi-Agent System
3
+ Implements Retriever, Reader, Critic, and Synthesizer agents
4
+ """
5
+
6
+ from typing import List, Dict, Tuple
7
+ from retriever import PaperRetriever
8
+ import os
9
+
10
+
11
class RetrieverAgent:
    """Agent that locates papers relevant to a user query.

    Thin wrapper around a PaperRetriever so the orchestrator can treat
    retrieval like every other pipeline stage.
    """

    def __init__(self, retriever: PaperRetriever):
        self.retriever = retriever

    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[Dict, float]]:
        """
        Retrieve relevant papers for the query

        Args:
            query: Free-text search string
            top_k: Maximum number of papers to return

        Returns:
            List of (paper, relevance_score) tuples
        """
        print(f"🔍 Retriever Agent: Searching for '{query}'...")
        hits = self.retriever.search(query, top_k)
        print(f" Found {len(hits)} relevant papers")
        return hits
28
+
29
+
30
class ReaderAgent:
    """Agent responsible for reading and summarizing papers"""

    def __init__(self, llm_client=None):
        """
        Args:
            llm_client: Optional LLM client (OpenAI, Anthropic, etc.)
                If None, uses rule-based summarization
        """
        self.llm_client = llm_client

    def summarize_paper(self, paper: Dict) -> Dict:
        """
        Generate a summary record for a single paper

        Args:
            paper: Paper dictionary with title, abstract, etc.

        Returns:
            Dict with 'title', 'arxiv_id', 'authors' (first 3),
            'summary' and 'year' keys.
            (Fix: was annotated -> str although both helpers return a dict.)
        """
        if self.llm_client:
            return self._llm_summarize(paper)
        return self._rule_based_summarize(paper)

    def _rule_based_summarize(self, paper: Dict) -> Dict:
        """Simple extractive summary (first 3 sentences of the abstract)"""
        sentences = paper['abstract'].split('. ')
        summary = '. '.join(sentences[:3])
        # Fix: only restore the trailing period when it is missing; the old
        # unconditional "+ '.'" doubled it for abstracts of 1-2 sentences.
        if not summary.endswith('.'):
            summary += '.'

        return {
            'title': paper['title'],
            'arxiv_id': paper['arxiv_id'],
            'authors': paper['authors'][:3],
            'summary': summary,
            'year': paper['published'][:4]
        }

    def _llm_summarize(self, paper: Dict) -> Dict:
        """Use LLM to generate intelligent summary (placeholder implementation)"""
        prompt = f"""Summarize this research paper in 2-3 sentences, focusing on:
1. The main contribution/idea
2. The key methodology or approach
3. Important results or implications

Title: {paper['title']}
Abstract: {paper['abstract']}

Summary:"""

        # Call LLM (implementation depends on client)
        # This is a placeholder - replace with actual LLM call; `prompt` is
        # intentionally built so the eventual call site is ready to use it.
        response = "LLM summary would go here"

        return {
            'title': paper['title'],
            'arxiv_id': paper['arxiv_id'],
            'authors': paper['authors'][:3],
            'summary': response,
            'year': paper['published'][:4]
        }

    def read_papers(self, papers: List[Tuple[Dict, float]]) -> List[Dict]:
        """
        Read and summarize multiple papers

        Args:
            papers: List of (paper, score) tuples from retriever

        Returns:
            List of summary dicts, each tagged with its retrieval
            'relevance_score'
        """
        print(f"📖 Reader Agent: Reading {len(papers)} papers...")
        summaries = []

        for paper, score in papers:
            summary = self.summarize_paper(paper)
            summary['relevance_score'] = score
            summaries.append(summary)

        print(f" Generated {len(summaries)} summaries")
        return summaries
114
+
115
+
116
class CriticAgent:
    """Agent responsible for evaluating and filtering summaries"""

    def __init__(self, llm_client=None):
        self.llm_client = llm_client

    def critique(self, summaries: List[Dict], query: str) -> List[Dict]:
        """
        Evaluate summaries for quality and relevance

        Args:
            summaries: List of paper summaries
            query: Original user query

        Returns:
            Summaries above the relevance threshold, annotated with a
            'quality_score' and sorted by combined relevance/quality
        """
        print(f"🔎 Critic Agent: Evaluating {len(summaries)} summaries...")

        kept = []
        for entry in summaries:
            # Relevance threshold: anything at or below 0.3 is discarded.
            if entry['relevance_score'] <= 0.3:
                continue
            entry['quality_score'] = self._assess_quality(entry, query)
            kept.append(entry)

        def combined(entry: Dict) -> float:
            # Relevance dominates; quality acts as a tie-breaker weight.
            return entry['relevance_score'] * 0.7 + entry['quality_score'] * 0.3

        kept.sort(key=combined, reverse=True)

        print(f" Retained {len(kept)} high-quality summaries")
        return kept

    def _assess_quality(self, summary: Dict, query: str) -> float:
        """
        Heuristic quality assessment (can be enhanced with LLM)

        Returns:
            Quality score 0-1
        """
        score = 0.5  # base score
        # Longer summaries might be more informative
        score += 0.2 if len(summary['summary']) > 100 else 0.0
        # Recent papers get a recency bonus
        score += 0.3 if int(summary['year']) >= 2024 else 0.0
        return min(score, 1.0)
171
+
172
+
173
class SynthesizerAgent:
    """Agent responsible for synthesizing final answer"""

    def __init__(self, llm_client=None):
        self.llm_client = llm_client

    def synthesize(
        self,
        summaries: List[Dict],
        query: str,
        max_papers: int = 10
    ) -> str:
        """
        Synthesize final answer from summaries

        Args:
            summaries: List of filtered, quality summaries
            query: Original user query
            max_papers: Maximum papers to include in response

        Returns:
            Final synthesized response with citations
        """
        print(f"✨ Synthesizer Agent: Creating final response...")

        if not summaries:
            return "No relevant papers found for your query."

        top = summaries[:max_papers]  # cap how many papers are cited
        builder = self._llm_synthesize if self.llm_client else self._rule_based_synthesize
        return builder(top, query)

    def _rule_based_synthesize(self, summaries: List[Dict], query: str) -> str:
        """Create structured response without LLM"""
        parts = [
            f"# Research Summary: {query}\n\n",
            f"Based on {len(summaries)} relevant papers from arXiv:\n\n",
        ]

        for rank, entry in enumerate(summaries, 1):
            parts.append(f"## [{rank}] {entry['title']}\n")
            author_line = f"**Authors:** {', '.join(entry['authors'])}"
            # Author lists were truncated to 3 upstream, so 3 names may
            # stand for more — mark with "et al."
            if len(entry['authors']) >= 3:
                author_line += " et al."
            parts.append(author_line)
            parts.append(f"\n**Year:** {entry['year']}\n")
            parts.append(f"**arXiv ID:** {entry['arxiv_id']}\n")
            parts.append(f"**Relevance:** {entry['relevance_score']:.2f}\n\n")
            parts.append(f"{entry['summary']}\n\n")
            parts.append("---\n\n")

        return "".join(parts)

    def _llm_synthesize(self, summaries: List[Dict], query: str) -> str:
        """Use LLM to create coherent synthesis"""
        # Build numbered context the prompt can cite by [n].
        context = "".join(
            f"[{rank}] {entry['title']} ({entry['arxiv_id']})\n"
            f" {entry['summary']}\n\n"
            for rank, entry in enumerate(summaries, 1)
        )

        prompt = f"""You are a research assistant. Based on the following papers, answer this question:

Question: {query}

Papers:
{context}

Provide a comprehensive answer that:
1. Directly addresses the question
2. Synthesizes information across papers
3. Cites papers by number [1], [2], etc.
4. Highlights key findings and consensus/disagreements
5. Is concise but thorough (3-5 paragraphs)

Answer:"""

        # Placeholder for LLM call
        response = "LLM-generated synthesis would go here with citations"

        # Append paper references
        response += "\n\n## References\n"
        for rank, entry in enumerate(summaries, 1):
            response += f"[{rank}] {entry['title']} "
            response += f"({entry['arxiv_id']}, {entry['year']})\n"

        return response
261
+
262
+
263
class DocMindOrchestrator:
    """Main orchestrator that coordinates all agents"""

    def __init__(self, retriever: PaperRetriever, llm_client=None):
        self.retriever_agent = RetrieverAgent(retriever)
        self.reader_agent = ReaderAgent(llm_client)
        self.critic_agent = CriticAgent(llm_client)
        self.synthesizer_agent = SynthesizerAgent(llm_client)

    def process_query(
        self,
        query: str,
        top_k: int = 10,
        max_papers_in_response: int = 5
    ) -> str:
        """
        Run the full Retriever -> Reader -> Critic -> Synthesizer pipeline

        Args:
            query: User question
            top_k: Number of papers to retrieve
            max_papers_in_response: Max papers in final response

        Returns:
            Final synthesized answer
        """
        banner = '=' * 60
        print(f"\n{banner}")
        print(f"Processing query: {query}")
        print(banner)

        # Step 1: Retrieve candidate papers
        papers = self.retriever_agent.retrieve(query, top_k)
        if not papers:
            return "No relevant papers found for your query."

        # Step 2: Read & summarize each candidate
        summaries = self.reader_agent.read_papers(papers)

        # Step 3: Filter out low-relevance / low-quality summaries
        vetted = self.critic_agent.critique(summaries, query)

        # Step 4: Compose the cited final answer
        answer = self.synthesizer_agent.synthesize(
            vetted,
            query,
            max_papers_in_response
        )

        print(f"{banner}\n")
        return answer
318
+
319
+
320
def main():
    """Example usage of multi-agent system"""
    from fetch_arxiv_data import ArxivFetcher

    # Setup
    fetcher = ArxivFetcher()
    retriever = PaperRetriever()

    # Reuse a persisted index when available; otherwise build and save one.
    if not retriever.load_index():
        corpus = fetcher.load_papers("arxiv_papers.json")
        retriever.build_index(corpus)
        retriever.save_index()

    orchestrator = DocMindOrchestrator(retriever)

    # Demo queries exercising the full pipeline
    demo_queries = [
        "What are the latest improvements in diffusion models?",
        "How does RLHF compare to DPO for language model alignment?",
        "What are the main challenges in scaling transformers?"
    ]

    for question in demo_queries:
        answer = orchestrator.process_query(question, top_k=8, max_papers_in_response=3)
        print(answer)
        print("\n" + "=" * 80 + "\n")


if __name__ == "__main__":
    main()
fetch_arxiv_data.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DocMind - arXiv Data Fetcher
3
+ Fetches papers from arXiv API and saves them for indexing
4
+ """
5
+
6
+ import arxiv
7
+ import os
8
+ import json
9
+ from pathlib import Path
10
+ from typing import List, Dict
11
+
12
+
13
class ArxivFetcher:
    """Fetches arXiv paper metadata and persists it as JSON."""

    def __init__(self, data_dir: str = "data/papers"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_papers(
        self,
        query: str = "machine learning",
        max_results: int = 100,
        category: str = None
    ) -> List[Dict]:
        """
        Fetch papers from arXiv API

        Args:
            query: Search query string
            max_results: Maximum number of papers to fetch
            category: arXiv category (e.g., 'cs.AI', 'cs.LG')

        Returns:
            List of paper dictionaries
        """
        print(f"Fetching papers from arXiv: query='{query}', max={max_results}")

        # Scope the query to a category when one is given.
        search_query = f"cat:{category} AND {query}" if category else query

        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        papers = [
            {
                'arxiv_id': result.entry_id.split('/')[-1],
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'abstract': result.summary,
                'published': result.published.strftime('%Y-%m-%d'),
                'pdf_url': result.pdf_url,
                'categories': result.categories
            }
            for result in search.results()
        ]

        print(f"Successfully fetched {len(papers)} papers")
        return papers

    def save_papers(self, papers: List[Dict], filename: str = "papers.json"):
        """Save papers to JSON file"""
        target = self.data_dir / filename
        with open(target, 'w', encoding='utf-8') as fh:
            json.dump(papers, fh, indent=2, ensure_ascii=False)
        print(f"Saved {len(papers)} papers to {target}")

    def load_papers(self, filename: str = "papers.json") -> List[Dict]:
        """Load papers from JSON file; returns [] when the file is missing"""
        source = self.data_dir / filename
        if not source.exists():
            print(f"No saved papers found at {source}")
            return []

        with open(source, 'r', encoding='utf-8') as fh:
            papers = json.load(fh)
        print(f"Loaded {len(papers)} papers from {source}")
        return papers
82
+
83
+
84
def main():
    """Example usage: Fetch recent ML and AI papers"""
    fetcher = ArxivFetcher()

    # Fetch recent ML papers
    ml_papers = fetcher.fetch_papers(
        query="machine learning OR deep learning",
        max_results=50,
        category="cs.LG"
    )

    # Fetch recent AI papers
    ai_papers = fetcher.fetch_papers(
        query="artificial intelligence OR neural networks",
        max_results=50,
        category="cs.AI"
    )

    # Combine and save
    all_papers = ml_papers + ai_papers
    fetcher.save_papers(all_papers, "arxiv_papers.json")

    # Fix: the API can legitimately return no results (network hiccup,
    # restrictive query); indexing all_papers[0] would raise IndexError.
    if not all_papers:
        print("No papers fetched; nothing to preview.")
        return

    # Show sample
    print("\n=== Sample Paper ===")
    print(f"Title: {all_papers[0]['title']}")
    print(f"Authors: {', '.join(all_papers[0]['authors'][:3])}")
    print(f"Abstract: {all_papers[0]['abstract'][:200]}...")


if __name__ == "__main__":
    main()
retriever.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DocMind - Retriever Module
3
+ Semantic search over arXiv papers using FAISS and sentence-transformers
4
+ """
5
+
6
+ import numpy as np
7
+ import faiss
8
+ from sentence_transformers import SentenceTransformer
9
+ from typing import List, Dict, Tuple
10
+ import pickle
11
+ from pathlib import Path
12
+
13
+
14
class PaperRetriever:
    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        index_path: str = "data/faiss_index"
    ):
        """
        Initialize retriever with embedding model and FAISS index

        Args:
            model_name: HuggingFace sentence-transformer model
            index_path: Directory to save/load FAISS index
        """
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.index_path = Path(index_path)
        self.index_path.mkdir(parents=True, exist_ok=True)

        self.index = None       # FAISS index; None until built or loaded
        self.papers = []        # paper dicts, aligned with index rows
        self.embeddings = None  # normalized embedding matrix

    def build_index(self, papers: List[Dict]):
        """
        Build FAISS index from papers

        Args:
            papers: List of paper dictionaries with 'title' and 'abstract'
        """
        print(f"Building index for {len(papers)} papers...")
        self.papers = papers

        # Create text to embed: title + abstract
        texts = [
            f"{paper['title']}. {paper['abstract']}"
            for paper in papers
        ]

        # Generate embeddings
        print("Generating embeddings...")
        self.embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        # Build FAISS index
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity)

        # Normalize embeddings so inner product equals cosine similarity
        faiss.normalize_L2(self.embeddings)
        self.index.add(self.embeddings)

        print(f"Index built with {self.index.ntotal} papers")

    def save_index(self, name: str = "papers"):
        """Save FAISS index and metadata"""
        faiss.write_index(self.index, str(self.index_path / f"{name}.index"))

        with open(self.index_path / f"{name}_papers.pkl", 'wb') as f:
            pickle.dump(self.papers, f)

        with open(self.index_path / f"{name}_embeddings.npy", 'wb') as f:
            np.save(f, self.embeddings)

        print(f"Saved index to {self.index_path}/{name}.*")

    def load_index(self, name: str = "papers") -> bool:
        """Load FAISS index and metadata; returns True on success"""
        index_file = self.index_path / f"{name}.index"
        if not index_file.exists():
            print(f"No index found at {index_file}")
            return False

        self.index = faiss.read_index(str(index_file))

        with open(self.index_path / f"{name}_papers.pkl", 'rb') as f:
            self.papers = pickle.load(f)

        with open(self.index_path / f"{name}_embeddings.npy", 'rb') as f:
            self.embeddings = np.load(f)

        print(f"Loaded index with {len(self.papers)} papers")
        return True

    def search(
        self,
        query: str,
        top_k: int = 5
    ) -> List[Tuple[Dict, float]]:
        """
        Search for relevant papers

        Args:
            query: Search query string
            top_k: Number of results to return (fewer are returned when the
                index holds fewer than top_k papers)

        Returns:
            List of (paper_dict, score) tuples

        Raises:
            ValueError: if no index has been built or loaded
        """
        if self.index is None:
            raise ValueError("Index not built or loaded")

        # Embed query
        query_embedding = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, top_k)

        # Return results
        results = []
        for idx, score in zip(indices[0], scores[0]):
            # Fix: FAISS pads with -1 when fewer than top_k vectors exist;
            # self.papers[-1] would silently return the LAST paper, so skip
            # padding entries instead.
            if idx < 0:
                continue
            results.append((self.papers[idx], float(score)))

        return results

    def get_retrieval_context(
        self,
        query: str,
        top_k: int = 5
    ) -> str:
        """
        Get formatted context string for LLM consumption

        Args:
            query: Search query
            top_k: Number of papers to retrieve

        Returns:
            Formatted context string with paper summaries
        """
        results = self.search(query, top_k)

        context = f"Retrieved {len(results)} relevant papers:\n\n"
        for i, (paper, score) in enumerate(results, 1):
            context += f"[{i}] {paper['title']}\n"
            context += f" Authors: {', '.join(paper['authors'][:3])}"
            if len(paper['authors']) > 3:
                context += f" et al."
            context += f"\n arXiv ID: {paper['arxiv_id']}\n"
            context += f" Published: {paper['published']}\n"
            context += f" Relevance: {score:.3f}\n"
            context += f" Abstract: {paper['abstract']}\n\n"

        return context
162
+
163
+
164
def main():
    """Example: Build and test retriever"""
    from fetch_arxiv_data import ArxivFetcher

    # Load previously fetched papers from disk
    papers = ArxivFetcher().load_papers("arxiv_papers.json")
    if not papers:
        print("No papers found. Run fetch_arxiv_data.py first")
        return

    # Build and persist a fresh index
    retriever = PaperRetriever()
    retriever.build_index(papers)
    retriever.save_index()

    # Smoke-test a few representative searches
    sample_queries = [
        "diffusion models for image generation",
        "reinforcement learning from human feedback",
        "large language model alignment"
    ]

    divider = '=' * 60
    for q in sample_queries:
        print(f"\n{divider}")
        print(f"Query: {q}")
        print(divider)

        for rank, (paper, score) in enumerate(retriever.search(q, top_k=3), 1):
            print(f"\n[{rank}] Score: {score:.3f}")
            print(f" {paper['title']}")
            print(f" {paper['arxiv_id']}")


if __name__ == "__main__":
    main()
utils.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DocMind - Utility Functions
3
+ Helper functions for the multi-agent system
4
+ """
5
+
6
+ from typing import List, Dict
7
+ import re
8
+ from datetime import datetime
9
+
10
+
11
def clean_text(text: str) -> str:
    """Normalize whitespace and strip unusual symbols from *text*.

    Collapses runs of whitespace to single spaces, removes characters
    other than word characters, whitespace and basic punctuation, then
    trims leading/trailing spaces.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    cleaned = re.sub(r'[^\w\s.,!?;:()\-]', '', collapsed)
    return cleaned.strip()
18
+
19
+
20
def truncate_text(text: str, max_length: int = 500) -> str:
    """Shorten *text* to at most roughly *max_length* characters.

    Prefers cutting at the last sentence boundary inside the limit;
    otherwise hard-truncates and appends an ellipsis.
    """
    if len(text) <= max_length:
        return text

    clipped = text[:max_length]
    cut = clipped.rfind('.')
    # A period at position 0 is not a usable sentence boundary.
    return clipped[:cut + 1] if cut > 0 else clipped + "..."
32
+
33
+
34
def format_authors(authors: List[str], max_authors: int = 3) -> str:
    """Join author names for display, abbreviating long lists with 'et al.'."""
    shown = ", ".join(authors[:max_authors])
    return shown if len(authors) <= max_authors else shown + " et al."
40
+
41
+
42
def extract_year(date_string: str) -> int:
    """Extract the 4-digit year from an ISO-like date string.

    Falls back to the current year when *date_string* is not a string or
    does not start with an integer year.
    """
    # Fix: replaced a bare `except:` (which swallowed even KeyboardInterrupt)
    # with an explicit type check plus the one exception int() can raise here.
    if not isinstance(date_string, str):
        return datetime.now().year
    try:
        return int(date_string[:4])
    except ValueError:  # empty string or non-numeric prefix
        return datetime.now().year
50
+
51
+
52
def score_recency(year: int, current_year: int = None) -> float:
    """
    Score paper based on recency

    Returns:
        Score from 0-1, where 1 is most recent
    """
    if current_year is None:
        current_year = datetime.now().year

    age = current_year - year
    if age <= 0:
        return 1.0

    # Stepwise decay for the first three years, then a 1/(age+1) tail
    # floored at 0.3.
    for cutoff, value in ((1, 0.9), (2, 0.7), (3, 0.5)):
        if age <= cutoff:
            return value
    return max(0.3, 1.0 / (age + 1))
73
+
74
+
75
def combine_scores(
    relevance: float,
    recency: float,
    quality: float,
    weights: Dict[str, float] = None
) -> float:
    """
    Combine multiple scores with weights

    Args:
        relevance: Relevance score (0-1)
        recency: Recency score (0-1)
        quality: Quality score (0-1)
        weights: Dict with keys 'relevance', 'recency', 'quality'

    Returns:
        Combined score (0-1)
    """
    # Default weighting favors relevance over recency/quality.
    w = weights if weights is not None else {
        'relevance': 0.6,
        'recency': 0.2,
        'quality': 0.2
    }

    weighted = (
        relevance * w['relevance'],
        recency * w['recency'],
        quality * w['quality'],
    )
    return weighted[0] + weighted[1] + weighted[2]
105
+
106
+
107
def deduplicate_papers(papers: List[Dict]) -> List[Dict]:
    """Drop papers whose arXiv ID was already seen, preserving order.

    Papers with a missing or empty 'arxiv_id' are discarded entirely,
    matching the original behavior.
    """
    seen_ids = set()
    deduped = []

    for entry in papers:
        pid = entry.get('arxiv_id', '')
        if not pid or pid in seen_ids:
            continue
        seen_ids.add(pid)
        deduped.append(entry)

    return deduped
119
+
120
+
121
def format_citation(paper: Dict, style: str = 'apa') -> str:
    """
    Format paper citation

    Args:
        paper: Paper dict with title, authors, year, arxiv_id
        style: Citation style ('apa', 'simple', 'markdown')

    Returns:
        Formatted citation string (unknown styles fall back to 'simple')
    """
    authors = format_authors(paper.get('authors', []))
    title = paper.get('title', 'Unknown Title')
    year = extract_year(paper.get('published', ''))
    arxiv_id = paper.get('arxiv_id', '')

    if style == 'apa':
        return f"{authors} ({year}). {title}. arXiv:{arxiv_id}"
    if style == 'markdown':
        return f"**{title}** - {authors} ({year}) - arXiv:[{arxiv_id}](https://arxiv.org/abs/{arxiv_id})"
    # simple (default fallback)
    return f"{title} ({arxiv_id}, {year})"
145
+
146
+
147
def extract_keywords(text: str, top_n: int = 5) -> List[str]:
    """
    Extract simple keywords from text (frequency-based)

    Args:
        text: Input text
        top_n: Number of keywords to return

    Returns:
        List of top keywords (ties keep first-occurrence order)
    """
    from collections import Counter  # local import: keeps module deps unchanged

    # Common words that carry no topical signal in abstracts.
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
        'this', 'that', 'these', 'those', 'we', 'our', 'propose', 'show'
    }

    # Lowercase words of 4+ letters only; shorter tokens are rarely keywords.
    words = re.findall(r'\b[a-z]{4,}\b', text.lower())
    counts = Counter(w for w in words if w not in stop_words)

    # Stable sort on count preserves first-occurrence order for ties,
    # matching the original hand-rolled dict counting.
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]
177
+
178
+
179
class ProgressTracker:
    """Simple progress tracker for multi-step processes"""

    def __init__(self, total_steps: int):
        self.total_steps = total_steps  # expected number of steps
        self.current_step = 0           # steps completed so far
        self.step_names = []            # optional labels, in completion order

    def next_step(self, step_name: str = None):
        """Advance one step, optionally recording its name."""
        self.current_step += 1
        if step_name:
            self.step_names.append(step_name)

    def get_progress(self) -> float:
        """Completed fraction as a percentage (0-100)."""
        return (self.current_step / self.total_steps) * 100

    def get_status(self) -> str:
        """Human-readable 'Step i/n (p%)' status line."""
        return f"Step {self.current_step}/{self.total_steps} ({self.get_progress():.1f}%)"
200
+
201
+
202
def validate_paper_dict(paper: Dict) -> bool:
    """Check that a paper dict carries every field the pipeline relies on."""
    for field in ('title', 'abstract', 'arxiv_id', 'authors', 'published'):
        if field not in paper:
            return False
    return True
206
+
207
+
208
def safe_get(dictionary: Dict, key: str, default=None):
    """Safely get a value from a mapping, tolerating non-mapping inputs.

    Returns *default* when *dictionary* has no usable ``.get`` (e.g. None)
    or when *key* is unhashable.
    """
    # Fix: a bare `except:` hid every error, including KeyboardInterrupt;
    # only the two failure modes .get can actually produce are caught now.
    try:
        return dictionary.get(key, default)
    except (AttributeError, TypeError):
        return default
214
+
215
+
216
# Example usage
if __name__ == "__main__":
    # Smoke-test the utilities against a well-known paper.
    sample_paper = {
        'title': 'Attention Is All You Need',
        'authors': ['Vaswani', 'Shazeer', 'Parmar', 'Uszkoreit'],
        'published': '2017-06-12',
        'arxiv_id': '1706.03762',
        'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...'
    }

    demos = [
        ("Citation (APA):", format_citation(sample_paper, 'apa')),
        ("Citation (Markdown):", format_citation(sample_paper, 'markdown')),
        ("Authors:", format_authors(sample_paper['authors'])),
        ("Recency score:", score_recency(2017)),
        ("Keywords:", extract_keywords(sample_paper['abstract'])),
    ]
    for label, value in demos:
        print(label, value)
+ print("Keywords:", extract_keywords(sample_paper['abstract']))