Spaces:
Paused
Paused
itsOwen committed on
Commit ·
224cf2e
1
Parent(s): 3ba9a4d
regex added to filter webpage data and reduce prompt cost and improve speed of responses
Browse files- main.py +1 -1
- src/web_extractor.py +23 -15
main.py
CHANGED
|
@@ -180,7 +180,7 @@ def main():
|
|
| 180 |
for chat_id, chat_data in chats:
|
| 181 |
messages = chat_data['messages']
|
| 182 |
if messages:
|
| 183 |
-
button_label = f"
|
| 184 |
else:
|
| 185 |
button_label = "🗨️ Empty Chat"
|
| 186 |
|
|
|
|
| 180 |
for chat_id, chat_data in chats:
|
| 181 |
messages = chat_data['messages']
|
| 182 |
if messages:
|
| 183 |
+
button_label = f"{messages[0]['content'][:25]}..."
|
| 184 |
else:
|
| 185 |
button_label = "🗨️ Empty Chat"
|
| 186 |
|
src/web_extractor.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Dict, Any, Optional, List
|
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
from io import BytesIO
|
|
|
|
| 6 |
from .models import Models
|
| 7 |
from .scrapers import PlaywrightScraper, HTMLScraper, JSONScraper
|
| 8 |
from .utils.proxy_manager import ProxyManager
|
|
@@ -11,6 +12,7 @@ from langchain.prompts import PromptTemplate
|
|
| 11 |
from langchain.schema.runnable import RunnableSequence
|
| 12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 13 |
import tiktoken
|
|
|
|
| 14 |
|
| 15 |
class WebExtractor:
|
| 16 |
def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
|
|
@@ -23,6 +25,7 @@ class WebExtractor:
|
|
| 23 |
self.markdown_formatter = MarkdownFormatter()
|
| 24 |
self.current_url = None
|
| 25 |
self.current_content = None
|
|
|
|
| 26 |
self.conversation_history: List[str] = []
|
| 27 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 28 |
chunk_size=32000,
|
|
@@ -53,15 +56,24 @@ class WebExtractor:
|
|
| 53 |
self.current_url = url
|
| 54 |
proxy = await self.proxy_manager.get_proxy()
|
| 55 |
self.current_content = await self.playwright_scraper.fetch_content(self.current_url, proxy)
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
async def _extract_info(self, query: str) -> str:
|
| 59 |
-
content_tokens = self.num_tokens_from_string(self.
|
| 60 |
|
| 61 |
extraction_prompt = PromptTemplate(
|
| 62 |
input_variables=["webpage_content", "query"],
|
| 63 |
template="""You are an AI assistant that helps with web scraping tasks.
|
| 64 |
-
Based on the following webpage content and the user's request, extract the relevant information.
|
| 65 |
Present the data in a structured format as specified by the user's query:
|
| 66 |
- If the user asks for JSON, respond with a JSON array of objects.
|
| 67 |
- If the user asks for CSV, respond with CSV data (including headers).
|
|
@@ -76,7 +88,7 @@ class WebExtractor:
|
|
| 76 |
If the user asks for all extractable data, provide all entries you can find.
|
| 77 |
Ensure that the extracted data accurately reflects the content of the webpage.
|
| 78 |
|
| 79 |
-
|
| 80 |
{webpage_content}
|
| 81 |
|
| 82 |
Human: {query}
|
|
@@ -85,16 +97,15 @@ class WebExtractor:
|
|
| 85 |
|
| 86 |
if content_tokens <= self.max_tokens - 1000:
|
| 87 |
chain = RunnableSequence(extraction_prompt | self.model)
|
| 88 |
-
response = await chain.ainvoke({"webpage_content": self.
|
| 89 |
extracted_data = response.content
|
| 90 |
else:
|
| 91 |
-
chunks = self.
|
| 92 |
all_extracted_data = []
|
| 93 |
for chunk in chunks:
|
| 94 |
chain = RunnableSequence(extraction_prompt | self.model)
|
| 95 |
response = await chain.ainvoke({"webpage_content": chunk, "query": query})
|
| 96 |
all_extracted_data.append(response.content)
|
| 97 |
-
|
| 98 |
extracted_data = "\n".join(all_extracted_data)
|
| 99 |
|
| 100 |
if 'json' in query.lower():
|
|
@@ -106,8 +117,12 @@ class WebExtractor:
|
|
| 106 |
else:
|
| 107 |
return self._format_as_text(extracted_data)
|
| 108 |
|
|
|
|
|
|
|
|
|
|
| 109 |
def _format_as_json(self, data: str) -> str:
|
| 110 |
return data
|
|
|
|
| 111 |
def _format_as_csv(self, data: str) -> str:
|
| 112 |
return data
|
| 113 |
|
|
@@ -115,18 +130,13 @@ class WebExtractor:
|
|
| 115 |
try:
|
| 116 |
lines = data.strip().split('\n')
|
| 117 |
rows = [line.split('|') for line in lines if line.strip()]
|
| 118 |
-
|
| 119 |
df = pd.DataFrame(rows[1:], columns=[col.strip() for col in rows[0]])
|
| 120 |
-
|
| 121 |
output_filename = "output.xlsx"
|
| 122 |
-
|
| 123 |
with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
|
| 124 |
df.to_excel(writer, index=False)
|
| 125 |
-
|
| 126 |
return f"Excel data saved to {output_filename}"
|
| 127 |
-
|
| 128 |
except Exception as e:
|
| 129 |
-
return f"Error: Unable to convert to Excel format. {str(e)}. Raw data: {data[:500]}..."
|
| 130 |
|
| 131 |
def _format_as_text(self, data: str) -> str:
|
| 132 |
try:
|
|
@@ -138,10 +148,8 @@ class WebExtractor:
|
|
| 138 |
async def save_data(self, filename: str) -> str:
|
| 139 |
if not self.current_content:
|
| 140 |
return "No data to save. Please fetch a webpage first."
|
| 141 |
-
|
| 142 |
with open(filename, 'w', encoding='utf-8') as f:
|
| 143 |
f.write(self.current_content)
|
| 144 |
-
|
| 145 |
return f"Data saved to {filename}"
|
| 146 |
|
| 147 |
def format_to_markdown(self, text: str) -> str:
|
|
|
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
from io import BytesIO
|
| 6 |
+
import re
|
| 7 |
from .models import Models
|
| 8 |
from .scrapers import PlaywrightScraper, HTMLScraper, JSONScraper
|
| 9 |
from .utils.proxy_manager import ProxyManager
|
|
|
|
| 12 |
from langchain.schema.runnable import RunnableSequence
|
| 13 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 14 |
import tiktoken
|
| 15 |
+
import time
|
| 16 |
|
| 17 |
class WebExtractor:
|
| 18 |
def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
|
|
|
|
| 25 |
self.markdown_formatter = MarkdownFormatter()
|
| 26 |
self.current_url = None
|
| 27 |
self.current_content = None
|
| 28 |
+
self.preprocessed_content = None
|
| 29 |
self.conversation_history: List[str] = []
|
| 30 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 31 |
chunk_size=32000,
|
|
|
|
| 56 |
self.current_url = url
|
| 57 |
proxy = await self.proxy_manager.get_proxy()
|
| 58 |
self.current_content = await self.playwright_scraper.fetch_content(self.current_url, proxy)
|
| 59 |
+
self.preprocessed_content = self._preprocess_content(self.current_content)
|
| 60 |
+
return f"I've fetched and preprocessed the content from {self.current_url}. What would you like to know about it?"
|
| 61 |
+
|
| 62 |
+
def _preprocess_content(self, content: str) -> str:
|
| 63 |
+
content = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', content)
|
| 64 |
+
content = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', content)
|
| 65 |
+
content = re.sub(r'<!--[\s\S]*?-->', '', content)
|
| 66 |
+
content = re.sub(r'<(?!/?(?:table|tr|th|td|thead|tbody|ul|ol|li|p|h[1-6]|br|hr)[>\s])\/?[^>]*>', '', content)
|
| 67 |
+
content = re.sub(r'\s+', ' ', content)
|
| 68 |
+
return content.strip()
|
| 69 |
|
| 70 |
async def _extract_info(self, query: str) -> str:
|
| 71 |
+
content_tokens = self.num_tokens_from_string(self.preprocessed_content)
|
| 72 |
|
| 73 |
extraction_prompt = PromptTemplate(
|
| 74 |
input_variables=["webpage_content", "query"],
|
| 75 |
template="""You are an AI assistant that helps with web scraping tasks.
|
| 76 |
+
Based on the following preprocessed webpage content and the user's request, extract the relevant information.
|
| 77 |
Present the data in a structured format as specified by the user's query:
|
| 78 |
- If the user asks for JSON, respond with a JSON array of objects.
|
| 79 |
- If the user asks for CSV, respond with CSV data (including headers).
|
|
|
|
| 88 |
If the user asks for all extractable data, provide all entries you can find.
|
| 89 |
Ensure that the extracted data accurately reflects the content of the webpage.
|
| 90 |
|
| 91 |
+
Preprocessed webpage content:
|
| 92 |
{webpage_content}
|
| 93 |
|
| 94 |
Human: {query}
|
|
|
|
| 97 |
|
| 98 |
if content_tokens <= self.max_tokens - 1000:
|
| 99 |
chain = RunnableSequence(extraction_prompt | self.model)
|
| 100 |
+
response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
|
| 101 |
extracted_data = response.content
|
| 102 |
else:
|
| 103 |
+
chunks = self.optimized_text_splitter(self.preprocessed_content)
|
| 104 |
all_extracted_data = []
|
| 105 |
for chunk in chunks:
|
| 106 |
chain = RunnableSequence(extraction_prompt | self.model)
|
| 107 |
response = await chain.ainvoke({"webpage_content": chunk, "query": query})
|
| 108 |
all_extracted_data.append(response.content)
|
|
|
|
| 109 |
extracted_data = "\n".join(all_extracted_data)
|
| 110 |
|
| 111 |
if 'json' in query.lower():
|
|
|
|
| 117 |
else:
|
| 118 |
return self._format_as_text(extracted_data)
|
| 119 |
|
| 120 |
+
def optimized_text_splitter(self, text: str) -> List[str]:
|
| 121 |
+
return self.text_splitter.split_text(text)
|
| 122 |
+
|
| 123 |
def _format_as_json(self, data: str) -> str:
|
| 124 |
return data
|
| 125 |
+
|
| 126 |
def _format_as_csv(self, data: str) -> str:
|
| 127 |
return data
|
| 128 |
|
|
|
|
| 130 |
try:
|
| 131 |
lines = data.strip().split('\n')
|
| 132 |
rows = [line.split('|') for line in lines if line.strip()]
|
|
|
|
| 133 |
df = pd.DataFrame(rows[1:], columns=[col.strip() for col in rows[0]])
|
|
|
|
| 134 |
output_filename = "output.xlsx"
|
|
|
|
| 135 |
with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
|
| 136 |
df.to_excel(writer, index=False)
|
|
|
|
| 137 |
return f"Excel data saved to {output_filename}"
|
|
|
|
| 138 |
except Exception as e:
|
| 139 |
+
return f"Error: Unable to convert to Excel format. {str(e)}. Raw data: {data[:500]}..."
|
| 140 |
|
| 141 |
def _format_as_text(self, data: str) -> str:
|
| 142 |
try:
|
|
|
|
| 148 |
async def save_data(self, filename: str) -> str:
|
| 149 |
if not self.current_content:
|
| 150 |
return "No data to save. Please fetch a webpage first."
|
|
|
|
| 151 |
with open(filename, 'w', encoding='utf-8') as f:
|
| 152 |
f.write(self.current_content)
|
|
|
|
| 153 |
return f"Data saved to {filename}"
|
| 154 |
|
| 155 |
def format_to_markdown(self, text: str) -> str:
|