inank committed on
Commit
9a4c8eb
·
verified ·
1 Parent(s): 937b418

fix: restore pdf extractor

Browse files
Files changed (1) hide show
  1. tools/pdf_extractor.py +20 -105
tools/pdf_extractor.py CHANGED
@@ -1,115 +1,30 @@
1
- from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
2
- import datetime
3
- import requests
4
- import pytz
5
- import yaml
6
- from tools.final_answer import FinalAnswerTool
7
- from tools.pdf_extractor import extract_text_from_pdf
8
 
9
- from Gradio_UI import GradioUI
10
 
11
  @tool
12
- def summarize_and_analyze_text(text: str, max_sentences: int = 5) -> str:
13
- """Analyzes and summarizes text content, extracting key information and main ideas.
14
-
15
- This tool intelligently condenses lengthy text into concise summaries while preserving
16
- the most important information. Perfect for processing search results, PDFs, and documents.
17
 
18
  Args:
19
- text: The text content to summarize and analyze
20
- max_sentences: Maximum number of sentences in the summary (default: 5)
21
 
22
  Returns:
23
- A formatted summary containing key points and main ideas from the text
24
  """
25
  try:
26
- # Remove extra whitespace and normalize text
27
- text = " ".join(text.split())
28
-
29
- if len(text) < 100:
30
- return f"Text is too short to summarize. Original text:\n{text}"
31
-
32
- # Split into sentences (simple approach)
33
- sentences = []
34
- import re
35
- for sent in re.split(r'(?<=[.!?])\s+', text):
36
- sent = sent.strip()
37
- if sent:
38
- sentences.append(sent)
39
-
40
- # Score sentences based on word frequency
41
- words = text.lower().split()
42
- word_freq = {}
43
- for word in words:
44
- if len(word) > 3: # Filter short words
45
- word_freq[word] = word_freq.get(word, 0) + 1
46
-
47
- # Select top sentences
48
- sentence_scores = []
49
- for i, sent in enumerate(sentences):
50
- score = sum(word_freq.get(word.lower(), 0) for word in sent.split())
51
- sentence_scores.append((i, score, sent))
52
-
53
- # Sort by original order but select based on scores
54
- top_indices = sorted([idx for idx, _, _ in sorted(sentence_scores, key=lambda x: -x[1])[:max_sentences]])
55
- summary_sentences = [sent for idx, _, sent in sentence_scores if idx in top_indices]
56
-
57
- summary = " ".join(summary_sentences)
58
-
59
- # Extract key entities (words that appear frequently)
60
- sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
61
- key_terms = ", ".join([word for word, _ in sorted_words[:5]])
62
-
63
- return f"""📋 SUMMARY:\n{summary}\n\n🔑 KEY TERMS: {key_terms}\n\n📊 ANALYSIS:\n- Text length: {len(text)} characters\n- Total sentences: {len(sentences)}\n- Summary length: {len(summary_sentences)} sentences"""
64
  except Exception as e:
65
- return f"Error analyzing text: {str(e)}"
66
-
67
- @tool
68
- def get_current_time_in_timezone(timezone: str) -> str:
69
- """A tool that fetches the current local time in a specified timezone.
70
- Args:
71
- timezone: A string representing a valid timezone (e.g., 'America/New_York').
72
- """
73
- try:
74
- # Create timezone object
75
- tz = pytz.timezone(timezone)
76
- # Get current time in that timezone
77
- local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
78
- return f"The current local time in {timezone} is: {local_time}"
79
- except Exception as e:
80
- return f"Error fetching time for timezone '{timezone}': {str(e)}"
81
-
82
-
83
- final_answer = FinalAnswerTool()
84
-
85
- # If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
86
- # model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
87
-
88
- model = HfApiModel(
89
- max_tokens=2096,
90
- temperature=0.5,
91
- model_id='Qwen/Qwen2.5-Coder-32B-Instruct',# it is possible that this model may be overloaded
92
- custom_role_conversions=None,
93
- )
94
-
95
-
96
- # Import tool from Hub
97
- image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
98
-
99
- with open("prompts.yaml", 'r') as stream:
100
- prompt_templates = yaml.safe_load(stream)
101
-
102
- agent = CodeAgent(
103
- model=model,
104
- tools=[image_generation_tool,get_current_time_in_timezone,extract_text_from_pdf,summarize_and_analyze_text,final_answer], ## add your tools here (don't remove final answer)
105
- max_steps=6,
106
- verbosity_level=1,
107
- grammar=None,
108
- planning_interval=None,
109
- name=None,
110
- description=None,
111
- prompt_templates=prompt_templates
112
- )
113
-
114
-
115
- GradioUI(agent, "/tmp").launch()
 
from smolagents import tool
import PyPDF2


@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts all text content from a PDF file.

    Args:
        pdf_path: The file path to the PDF file to extract text from (e.g., '/tmp/document.pdf')

    Returns:
        The extracted text content from the PDF file, with each page prefixed by
        a "--- Page N ---" marker, or a human-readable error string on failure.
    """
    try:
        extracted_text = []
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # enumerate the pages directly instead of indexing by range(len(...))
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # extract_text() may return None for image-only/empty pages;
                # fall back to "" so the page marker is not followed by "None"
                text = page.extract_text() or ""
                extracted_text.append(f"--- Page {page_num} ---\n{text}")
        return "\n\n".join(extracted_text)
    except FileNotFoundError:
        return f"Error: PDF file not found at path: {pdf_path}"
    except Exception as e:
        # Covers corrupt/encrypted PDFs and PyPDF2 parse errors; the agent
        # consumes the error string rather than an exception.
        return f"Error extracting text from PDF: {str(e)}"