Spaces:
Sleeping
Sleeping
Commit ·
58c1383
1
Parent(s): dfd9c92
URL cache
Browse files
app.py
CHANGED
|
@@ -4,13 +4,18 @@ import asyncio
|
|
| 4 |
import json
|
| 5 |
import tiktoken
|
| 6 |
import requests
|
| 7 |
-
|
|
|
|
| 8 |
from dataclasses import dataclass
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
|
| 11 |
# Load environment variables
|
| 12 |
load_dotenv()
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def count_tokens(text: str, model: str) -> Tuple[int, str]:
|
| 15 |
"""Count tokens in text using the specified model encoding.
|
| 16 |
|
|
@@ -45,6 +50,28 @@ def count_tokens_from_url(url: str, model: str) -> Tuple[int, int, str]:
|
|
| 45 |
return 0, 0, "No URL provided"
|
| 46 |
|
| 47 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# Fetch as HTML
|
| 49 |
html_response = requests.get(
|
| 50 |
url,
|
|
@@ -63,6 +90,13 @@ def count_tokens_from_url(url: str, model: str) -> Tuple[int, int, str]:
|
|
| 63 |
markdown_response.raise_for_status()
|
| 64 |
markdown_content = markdown_response.text
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# Count tokens for both
|
| 67 |
encoding = tiktoken.encoding_for_model(model)
|
| 68 |
html_tokens = len(encoding.encode(html_content))
|
|
|
|
| 4 |
import json
|
| 5 |
import tiktoken
|
| 6 |
import requests
|
| 7 |
+
import time
|
| 8 |
+
from typing import List, Tuple, Optional, Dict
|
| 9 |
from dataclasses import dataclass
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
|
| 12 |
# Load environment variables
|
| 13 |
load_dotenv()
|
| 14 |
|
| 15 |
+
# URL response cache: {url: {"html": str, "markdown": str, "timestamp": float}}
|
| 16 |
+
_url_cache: Dict[str, Dict] = {}
|
| 17 |
+
CACHE_DURATION = 900 # 15 minutes in seconds
|
| 18 |
+
|
| 19 |
def count_tokens(text: str, model: str) -> Tuple[int, str]:
|
| 20 |
"""Count tokens in text using the specified model encoding.
|
| 21 |
|
|
|
|
| 50 |
return 0, 0, "No URL provided"
|
| 51 |
|
| 52 |
try:
|
| 53 |
+
# Check cache first
|
| 54 |
+
current_time = time.time()
|
| 55 |
+
if url in _url_cache:
|
| 56 |
+
cached_entry = _url_cache[url]
|
| 57 |
+
if current_time - cached_entry["timestamp"] < CACHE_DURATION:
|
| 58 |
+
# Use cached content
|
| 59 |
+
html_content = cached_entry["html"]
|
| 60 |
+
markdown_content = cached_entry["markdown"]
|
| 61 |
+
|
| 62 |
+
# Count tokens for both
|
| 63 |
+
encoding = tiktoken.encoding_for_model(model)
|
| 64 |
+
html_tokens = len(encoding.encode(html_content))
|
| 65 |
+
markdown_tokens = len(encoding.encode(markdown_content))
|
| 66 |
+
|
| 67 |
+
cache_age = int(current_time - cached_entry["timestamp"])
|
| 68 |
+
status = f"✓ Fetched from cache ({cache_age}s old)\n"
|
| 69 |
+
status += f"HTML: {html_tokens} tokens ({len(html_content)} chars)\n"
|
| 70 |
+
status += f"Markdown: {markdown_tokens} tokens ({len(markdown_content)} chars)"
|
| 71 |
+
|
| 72 |
+
return html_tokens, markdown_tokens, status
|
| 73 |
+
|
| 74 |
+
# Cache miss or expired - fetch fresh content
|
| 75 |
# Fetch as HTML
|
| 76 |
html_response = requests.get(
|
| 77 |
url,
|
|
|
|
| 90 |
markdown_response.raise_for_status()
|
| 91 |
markdown_content = markdown_response.text
|
| 92 |
|
| 93 |
+
# Update cache
|
| 94 |
+
_url_cache[url] = {
|
| 95 |
+
"html": html_content,
|
| 96 |
+
"markdown": markdown_content,
|
| 97 |
+
"timestamp": current_time
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
# Count tokens for both
|
| 101 |
encoding = tiktoken.encoding_for_model(model)
|
| 102 |
html_tokens = len(encoding.encode(html_content))
|