alessio-vertemati committed on
Commit
58c1383
·
1 Parent(s): dfd9c92

Url cache

Browse files
Files changed (1) hide show
  1. app.py +35 -1
app.py CHANGED
@@ -4,13 +4,18 @@ import asyncio
4
  import json
5
  import tiktoken
6
  import requests
7
- from typing import List, Tuple, Optional
 
8
  from dataclasses import dataclass
9
  from dotenv import load_dotenv
10
 
11
  # Load environment variables
12
  load_dotenv()
13
 
 
 
 
 
14
  def count_tokens(text: str, model: str) -> Tuple[int, str]:
15
  """Count tokens in text using the specified model encoding.
16
 
@@ -45,6 +50,28 @@ def count_tokens_from_url(url: str, model: str) -> Tuple[int, int, str]:
45
  return 0, 0, "No URL provided"
46
 
47
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # Fetch as HTML
49
  html_response = requests.get(
50
  url,
@@ -63,6 +90,13 @@ def count_tokens_from_url(url: str, model: str) -> Tuple[int, int, str]:
63
  markdown_response.raise_for_status()
64
  markdown_content = markdown_response.text
65
 
 
 
 
 
 
 
 
66
  # Count tokens for both
67
  encoding = tiktoken.encoding_for_model(model)
68
  html_tokens = len(encoding.encode(html_content))
 
4
  import json
5
  import tiktoken
6
  import requests
7
+ import time
8
+ from typing import List, Tuple, Optional, Dict
9
  from dataclasses import dataclass
10
  from dotenv import load_dotenv
11
 
12
  # Load environment variables
13
  load_dotenv()
14
 
15
+ # URL response cache: {url: {"html": str, "markdown": str, "timestamp": float}}
16
+ _url_cache: Dict[str, Dict] = {}
17
+ CACHE_DURATION = 900 # 15 minutes in seconds
18
+
19
  def count_tokens(text: str, model: str) -> Tuple[int, str]:
20
  """Count tokens in text using the specified model encoding.
21
 
 
50
  return 0, 0, "No URL provided"
51
 
52
  try:
53
+ # Check cache first
54
+ current_time = time.time()
55
+ if url in _url_cache:
56
+ cached_entry = _url_cache[url]
57
+ if current_time - cached_entry["timestamp"] < CACHE_DURATION:
58
+ # Use cached content
59
+ html_content = cached_entry["html"]
60
+ markdown_content = cached_entry["markdown"]
61
+
62
+ # Count tokens for both
63
+ encoding = tiktoken.encoding_for_model(model)
64
+ html_tokens = len(encoding.encode(html_content))
65
+ markdown_tokens = len(encoding.encode(markdown_content))
66
+
67
+ cache_age = int(current_time - cached_entry["timestamp"])
68
+ status = f"✓ Fetched from cache ({cache_age}s old)\n"
69
+ status += f"HTML: {html_tokens} tokens ({len(html_content)} chars)\n"
70
+ status += f"Markdown: {markdown_tokens} tokens ({len(markdown_content)} chars)"
71
+
72
+ return html_tokens, markdown_tokens, status
73
+
74
+ # Cache miss or expired - fetch fresh content
75
  # Fetch as HTML
76
  html_response = requests.get(
77
  url,
 
90
  markdown_response.raise_for_status()
91
  markdown_content = markdown_response.text
92
 
93
+ # Update cache
94
+ _url_cache[url] = {
95
+ "html": html_content,
96
+ "markdown": markdown_content,
97
+ "timestamp": current_time
98
+ }
99
+
100
  # Count tokens for both
101
  encoding = tiktoken.encoding_for_model(model)
102
  html_tokens = len(encoding.encode(html_content))