inank committed on
Commit
9a4c8eb
·
verified ·
1 Parent(s): 937b418

fix: restore pdf extractor

Browse files
Files changed (1) hide show
  1. tools/pdf_extractor.py +20 -105
tools/pdf_extractor.py CHANGED
@@ -1,115 +1,30 @@
1
- from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
2
- import datetime
3
- import requests
4
- import pytz
5
- import yaml
6
- from tools.final_answer import FinalAnswerTool
7
- from tools.pdf_extractor import extract_text_from_pdf
8
 
9
- from Gradio_UI import GradioUI
10
 
11
  @tool
12
- def summarize_and_analyze_text(text: str, max_sentences: int = 5) -> str:
13
- """Analyzes and summarizes text content, extracting key information and main ideas.
14
-
15
- This tool intelligently condenses lengthy text into concise summaries while preserving
16
- the most important information. Perfect for processing search results, PDFs, and documents.
17
 
18
  Args:
19
- text: The text content to summarize and analyze
20
- max_sentences: Maximum number of sentences in the summary (default: 5)
21
 
22
  Returns:
23
- A formatted summary containing key points and main ideas from the text
24
  """
25
  try:
26
- # Remove extra whitespace and normalize text
27
- text = " ".join(text.split())
28
-
29
- if len(text) < 100:
30
- return f"Text is too short to summarize. Original text:\n{text}"
31
-
32
- # Split into sentences (simple approach)
33
- sentences = []
34
- import re
35
- for sent in re.split(r'(?<=[.!?])\s+', text):
36
- sent = sent.strip()
37
- if sent:
38
- sentences.append(sent)
39
-
40
- # Score sentences based on word frequency
41
- words = text.lower().split()
42
- word_freq = {}
43
- for word in words:
44
- if len(word) > 3: # Filter short words
45
- word_freq[word] = word_freq.get(word, 0) + 1
46
-
47
- # Select top sentences
48
- sentence_scores = []
49
- for i, sent in enumerate(sentences):
50
- score = sum(word_freq.get(word.lower(), 0) for word in sent.split())
51
- sentence_scores.append((i, score, sent))
52
-
53
- # Sort by original order but select based on scores
54
- top_indices = sorted([idx for idx, _, _ in sorted(sentence_scores, key=lambda x: -x[1])[:max_sentences]])
55
- summary_sentences = [sent for idx, _, sent in sentence_scores if idx in top_indices]
56
-
57
- summary = " ".join(summary_sentences)
58
-
59
- # Extract key entities (words that appear frequently)
60
- sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
61
- key_terms = ", ".join([word for word, _ in sorted_words[:5]])
62
-
63
- return f"""📋 SUMMARY:\n{summary}\n\n🔑 KEY TERMS: {key_terms}\n\n📊 ANALYSIS:\n- Text length: {len(text)} characters\n- Total sentences: {len(sentences)}\n- Summary length: {len(summary_sentences)} sentences"""
64
  except Exception as e:
65
- return f"Error analyzing text: {str(e)}"
66
-
67
- @tool
68
- def get_current_time_in_timezone(timezone: str) -> str:
69
- """A tool that fetches the current local time in a specified timezone.
70
- Args:
71
- timezone: A string representing a valid timezone (e.g., 'America/New_York').
72
- """
73
- try:
74
- # Create timezone object
75
- tz = pytz.timezone(timezone)
76
- # Get current time in that timezone
77
- local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
78
- return f"The current local time in {timezone} is: {local_time}"
79
- except Exception as e:
80
- return f"Error fetching time for timezone '{timezone}': {str(e)}"
81
-
82
-
83
- final_answer = FinalAnswerTool()
84
-
85
- # If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
86
- # model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
87
-
88
- model = HfApiModel(
89
- max_tokens=2096,
90
- temperature=0.5,
91
- model_id='Qwen/Qwen2.5-Coder-32B-Instruct',# it is possible that this model may be overloaded
92
- custom_role_conversions=None,
93
- )
94
-
95
-
96
- # Import tool from Hub
97
- image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
98
-
99
- with open("prompts.yaml", 'r') as stream:
100
- prompt_templates = yaml.safe_load(stream)
101
-
102
- agent = CodeAgent(
103
- model=model,
104
- tools=[image_generation_tool,get_current_time_in_timezone,extract_text_from_pdf,summarize_and_analyze_text,final_answer], ## add your tools here (don't remove final answer)
105
- max_steps=6,
106
- verbosity_level=1,
107
- grammar=None,
108
- planning_interval=None,
109
- name=None,
110
- description=None,
111
- prompt_templates=prompt_templates
112
- )
113
-
114
-
115
- GradioUI(agent, "/tmp").launch()
 
from smolagents import tool
import PyPDF2


@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts all text content from a PDF file.

    Args:
        pdf_path: The file path to the PDF file to extract text from (e.g., '/tmp/document.pdf')

    Returns:
        The extracted text content from the PDF file, with each page prefixed by
        a "--- Page N ---" marker, or a human-readable error string on failure.
    """
    try:
        extracted_text = []
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # enumerate the pages directly instead of indexing by range(len(...))
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # extract_text() may return None for image-only/empty pages;
                # fall back to "" so the page marker is not followed by "None"
                text = page.extract_text() or ""
                extracted_text.append(f"--- Page {page_num} ---\n{text}")
        return "\n\n".join(extracted_text)
    except FileNotFoundError:
        return f"Error: PDF file not found at path: {pdf_path}"
    except Exception as e:
        # Covers corrupt/encrypted PDFs and PyPDF2 parse errors; the agent
        # consumes the error string rather than an exception.
        return f"Error extracting text from PDF: {str(e)}"