itsOwen committed
Commit 224cf2e · 1 Parent(s): 3ba9a4d

Added a regex preprocessing step to filter webpage data, reducing prompt cost and improving response speed.

Files changed (2):
  1. main.py +1 -1
  2. src/web_extractor.py +23 -15
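
To see why the filtering pays off, here is a minimal sketch of the before/after token count, assuming tiktoken's cl100k_base encoding; strip_markup and sample_html are illustrative stand-ins, not code from this repo (the repo counts tokens via num_tokens_from_string):

import re
import tiktoken

def strip_markup(content: str) -> str:
    # Same filtering idea as the new _preprocess_content below:
    # drop script/style blocks and comments, then collapse whitespace.
    content = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', content)
    content = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', content)
    content = re.sub(r'<!--[\s\S]*?-->', '', content)
    content = re.sub(r'\s+', ' ', content)
    return content.strip()

# cl100k_base is an assumption; the tokenizer for gpt-4o-mini may differ.
enc = tiktoken.get_encoding("cl100k_base")
sample_html = (
    '<html><head><style>body { color: red }</style></head>'
    '<body><script>trackUser();</script><p>Price: $19.99</p></body></html>'
)
cleaned = strip_markup(sample_html)
print(len(enc.encode(sample_html)), "tokens before,", len(enc.encode(cleaned)), "tokens after")
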
main.py CHANGED
@@ -180,7 +180,7 @@ def main():
     for chat_id, chat_data in chats:
         messages = chat_data['messages']
         if messages:
-            button_label = f"🗨️ {messages[0]['content'][:25]}..."
+            button_label = f"{messages[0]['content'][:25]}..."
         else:
             button_label = "🗨️ Empty Chat"
 
src/web_extractor.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Any, Optional, List
 import json
 import pandas as pd
 from io import BytesIO
+import re
 from .models import Models
 from .scrapers import PlaywrightScraper, HTMLScraper, JSONScraper
 from .utils.proxy_manager import ProxyManager
@@ -11,6 +12,7 @@ from langchain.prompts import PromptTemplate
 from langchain.schema.runnable import RunnableSequence
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import tiktoken
+import time
 
 class WebExtractor:
     def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
@@ -23,6 +25,7 @@ class WebExtractor:
         self.markdown_formatter = MarkdownFormatter()
         self.current_url = None
         self.current_content = None
+        self.preprocessed_content = None
         self.conversation_history: List[str] = []
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=32000,
@@ -53,15 +56,24 @@ class WebExtractor:
         self.current_url = url
         proxy = await self.proxy_manager.get_proxy()
         self.current_content = await self.playwright_scraper.fetch_content(self.current_url, proxy)
-        return f"I've fetched the content from {self.current_url}. What would you like to know about it?"
+        self.preprocessed_content = self._preprocess_content(self.current_content)
+        return f"I've fetched and preprocessed the content from {self.current_url}. What would you like to know about it?"
+
+    def _preprocess_content(self, content: str) -> str:
+        content = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', content)
+        content = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', content)
+        content = re.sub(r'<!--[\s\S]*?-->', '', content)
+        content = re.sub(r'<(?!/?(?:table|tr|th|td|thead|tbody|ul|ol|li|p|h[1-6]|br|hr)[>\s])\/?[^>]*>', '', content)
+        content = re.sub(r'\s+', ' ', content)
+        return content.strip()
 
     async def _extract_info(self, query: str) -> str:
-        content_tokens = self.num_tokens_from_string(self.current_content)
+        content_tokens = self.num_tokens_from_string(self.preprocessed_content)
 
         extraction_prompt = PromptTemplate(
             input_variables=["webpage_content", "query"],
             template="""You are an AI assistant that helps with web scraping tasks.
-        Based on the following webpage content and the user's request, extract the relevant information.
+        Based on the following preprocessed webpage content and the user's request, extract the relevant information.
         Present the data in a structured format as specified by the user's query:
         - If the user asks for JSON, respond with a JSON array of objects.
         - If the user asks for CSV, respond with CSV data (including headers).
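
To show what the new _preprocess_content keeps, here is a hypothetical run on a toy snippet (preprocess is a standalone copy of the committed regexes; the page string is made up). Script, style, and comment blocks disappear; table, list, paragraph, and heading tags survive; every other tag is stripped while its inner text remains:

import re

def preprocess(content: str) -> str:
    content = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', content)
    content = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', content)
    content = re.sub(r'<!--[\s\S]*?-->', '', content)
    content = re.sub(r'<(?!/?(?:table|tr|th|td|thead|tbody|ul|ol|li|p|h[1-6]|br|hr)[>\s])\/?[^>]*>', '', content)
    content = re.sub(r'\s+', ' ', content)
    return content.strip()

page = '<div class="card"><h2>Widget</h2><span>$5</span><!-- ad slot --></div>'
print(preprocess(page))  # -> <h2>Widget</h2>$5
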
@@ -76,7 +88,7 @@ class WebExtractor:
         If the user asks for all extractable data, provide all entries you can find.
         Ensure that the extracted data accurately reflects the content of the webpage.
 
-        Webpage content:
+        Preprocessed webpage content:
         {webpage_content}
 
         Human: {query}
@@ -85,16 +97,15 @@
 
         if content_tokens <= self.max_tokens - 1000:
             chain = RunnableSequence(extraction_prompt | self.model)
-            response = await chain.ainvoke({"webpage_content": self.current_content, "query": query})
+            response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
             extracted_data = response.content
         else:
-            chunks = self.text_splitter.split_text(self.current_content)
+            chunks = self.optimized_text_splitter(self.preprocessed_content)
             all_extracted_data = []
             for chunk in chunks:
                 chain = RunnableSequence(extraction_prompt | self.model)
                 response = await chain.ainvoke({"webpage_content": chunk, "query": query})
                 all_extracted_data.append(response.content)
-
             extracted_data = "\n".join(all_extracted_data)
 
         if 'json' in query.lower():
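
The branch above is the other half of the cost control: a page that fits in the context budget goes to the model in one call, while anything larger is split into chunks that are queried one by one and joined. A standalone sketch of that flow, with max_tokens and content_tokens made up for illustration and chunk_overlap=0 assumed (chunk_size=32000 matches __init__):

from langchain.text_splitter import RecursiveCharacterTextSplitter

max_tokens = 128_000      # assumed context budget; the real value lives on WebExtractor
content_tokens = 150_000  # pretend token count for a large page

splitter = RecursiveCharacterTextSplitter(chunk_size=32000, chunk_overlap=0)

if content_tokens <= max_tokens - 1000:
    print("single-call path: send the whole preprocessed page in one prompt")
else:
    chunks = splitter.split_text("very long preprocessed page " * 10_000)
    print(f"chunked path: {len(chunks)} chunks, one model call each")
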
@@ -106,8 +117,12 @@
         else:
             return self._format_as_text(extracted_data)
 
+    def optimized_text_splitter(self, text: str) -> List[str]:
+        return self.text_splitter.split_text(text)
+
     def _format_as_json(self, data: str) -> str:
         return data
+
     def _format_as_csv(self, data: str) -> str:
         return data
 
@@ -115,18 +130,13 @@
         try:
             lines = data.strip().split('\n')
             rows = [line.split('|') for line in lines if line.strip()]
-
             df = pd.DataFrame(rows[1:], columns=[col.strip() for col in rows[0]])
-
             output_filename = "output.xlsx"
-
             with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
                 df.to_excel(writer, index=False)
-
             return f"Excel data saved to {output_filename}"
-
         except Exception as e:
-            return f"Error: Unable to convert to Excel format. {str(e)}. Raw data: {data[:500]}..."  # Limit raw data preview
+            return f"Error: Unable to convert to Excel format. {str(e)}. Raw data: {data[:500]}..."
 
     def _format_as_text(self, data: str) -> str:
         try:
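
For context on the (otherwise unchanged) Excel path above: it expects the model to answer with pipe-separated rows, header line first. A toy round-trip with made-up data; writing the .xlsx additionally requires the xlsxwriter package:

import pandas as pd

data = "name | price\nWidget | 19.99\nGadget | 5.00"
lines = data.strip().split('\n')
rows = [line.split('|') for line in lines if line.strip()]
df = pd.DataFrame(rows[1:], columns=[col.strip() for col in rows[0]])
print(df)
# with pd.ExcelWriter("output.xlsx", engine='xlsxwriter') as writer:
#     df.to_excel(writer, index=False)
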
@@ -138,10 +148,8 @@
     async def save_data(self, filename: str) -> str:
         if not self.current_content:
             return "No data to save. Please fetch a webpage first."
-
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(self.current_content)
-
         return f"Data saved to {filename}"
 
     def format_to_markdown(self, text: str) -> str:
 