Charles Azam committed on
Commit
ce79b68
·
1 Parent(s): e040f4f

feat: add test for crawl_database

Browse files
src/deepengineer/deepsearch/analyse_markdown_agent.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  from smolagents import CodeAgent, tool, Tool, LiteLLMModel
2
  from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
3
  from mistralai import OCRResponse
 
1
+ """
2
+ Simple agent to analyse a markdown, just to test some ideas.
3
+ """
4
+
5
  from smolagents import CodeAgent, tool, Tool, LiteLLMModel
6
  from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
7
  from mistralai import OCRResponse
src/deepengineer/webcrawler/pdf_utils.py CHANGED
@@ -7,7 +7,7 @@ from mistralai import Mistral
7
  import os
8
  from litellm import completion
9
 
10
- from mistralai.models import OCRResponse, OCRPageObject
11
  import yaml
12
  from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
13
  from litellm.exceptions import BadRequestError
@@ -106,9 +106,10 @@ def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
106
  return table_of_contents
107
 
108
  def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
109
- # split by big title starting with # and then a space
110
- pages = raw_markdown.split("\n# ")
111
- return OCRResponse(pages=[OCRPageObject(markdown="# " + page, page_number=i) for i, page in enumerate(pages)])
 
112
 
113
 
114
 
 
7
  import os
8
  from litellm import completion
9
 
10
+ from mistralai.models import OCRResponse, OCRPageObject, OCRUsageInfo
11
  import yaml
12
  from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
13
  from litellm.exceptions import BadRequestError
 
106
  return table_of_contents
107
 
108
def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
    """Wrap a raw markdown string into an OCRResponse, one page per top-level heading.

    The markdown is split at lines that begin with ``"# "`` (a level-1 heading);
    each resulting chunk becomes one ``OCRPageObject`` with the ``"# "`` prefix
    restored. Usage info is zeroed out since no real OCR call was made.

    Args:
        raw_markdown: Markdown text whose level-1 headings mark page boundaries.

    Returns:
        An ``OCRResponse`` whose ``pages`` list mirrors the heading-delimited
        chunks of ``raw_markdown``.
    """
    import re

    # Split ONLY at line-start "# " headings. A plain str.split("# ") would also
    # break on "## " subheadings (the second '#' + space matches) and on inline
    # "# " sequences, corrupting page contents.
    chunks = re.split(r"(?m)^# ", raw_markdown)
    # When the markdown starts with a heading, re.split yields an empty first
    # chunk; drop it so we don't emit a bogus empty page.
    if chunks and chunks[0] == "":
        chunks = chunks[1:]
    usage_info_empty = OCRUsageInfo(pages_processed=0)
    return OCRResponse(
        pages=[
            OCRPageObject(index=i, markdown="# " + chunk, images=[], dimensions=None)
            for i, chunk in enumerate(chunks)
        ],
        usage_info=usage_info_empty,
        model="",
    )
112
+
113
 
114
 
115
 
tests/webcrawler/test_crawl_database.py CHANGED
@@ -16,4 +16,9 @@ def test_crawl_database_arxiv_link():
16
  assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown is not None
17
  assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
18
 
19
-
 
 
 
 
 
 
16
  assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown is not None
17
  assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
18
 
19
def test_crawl_database_wikipedia_url():
    """Crawl a Wikipedia article and verify the stored markdown looks sane."""
    url = "https://en.wikipedia.org/wiki/Deep_learning"
    db = DataBase()
    db.crawl_url(url)
    result = db.get_markdown_of_url(url)
    # The crawl must have stored something retrievable for this URL.
    assert result is not None
    # First page should carry actual markdown content.
    assert result.pages[0].markdown is not None
    # A long article is expected to span many pages.
    assert len(result.pages) >= 40