Spaces:
Running
Running
Update brave.py
Browse files
brave.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import requests
|
| 4 |
from langchain_community.document_loaders import WebBaseLoader
|
| 5 |
-
from
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
import re
|
| 8 |
import time
|
|
@@ -15,9 +15,9 @@ load_dotenv()
|
|
| 15 |
# Initialize API clients
|
| 16 |
BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")
|
| 17 |
BRAVE_SEARCH_URL = "https://api.search.brave.com/res/v1/news/search"
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
|
| 22 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
| 23 |
def clean_content(content):
|
|
@@ -25,11 +25,22 @@ def clean_content(content):
|
|
| 25 |
soup = BeautifulSoup(content, 'html.parser')
|
| 26 |
|
| 27 |
# Remove unwanted elements
|
| 28 |
-
for element in soup(['header', 'footer', 'nav', 'aside']):
|
| 29 |
element.decompose()
|
| 30 |
|
| 31 |
-
#
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# Remove extra spaces and newlines
|
| 35 |
text = re.sub(r'\s+', ' ', text).strip()
|
|
@@ -39,7 +50,7 @@ def clean_content(content):
|
|
| 39 |
|
| 40 |
return text
|
| 41 |
|
| 42 |
-
|
| 43 |
def summarize_content(content, max_tokens=4000):
|
| 44 |
summarization_prompt = f"""Summarize the following content, preserving important details, facts, and figures. This summary will be used for research and news purposes, so accuracy and comprehensiveness are crucial. Keep the summary within approximately {max_tokens} tokens.
|
| 45 |
|
|
@@ -49,21 +60,21 @@ def summarize_content(content, max_tokens=4000):
|
|
| 49 |
Summary:"""
|
| 50 |
|
| 51 |
try:
|
| 52 |
-
|
|
|
|
| 53 |
messages=[
|
| 54 |
{"role": "system", "content": "You are an expert summarizer, capable of condensing information while retaining crucial details."},
|
| 55 |
{"role": "user", "content": summarization_prompt}
|
| 56 |
],
|
| 57 |
-
|
| 58 |
-
max_tokens=max_tokens,
|
| 59 |
)
|
| 60 |
|
| 61 |
-
summary =
|
| 62 |
if not summary.strip():
|
| 63 |
-
raise ValueError("Empty summary received from
|
| 64 |
return summary
|
| 65 |
except Exception as e:
|
| 66 |
-
raise ValueError(f"Error in
|
| 67 |
|
| 68 |
def perform_web_search(query, num_results=2):
|
| 69 |
headers = {
|
|
@@ -122,6 +133,7 @@ def perform_web_search(query, num_results=2):
|
|
| 122 |
def load_web_content(urls):
|
| 123 |
loader = WebBaseLoader(urls)
|
| 124 |
documents = loader.load()
|
|
|
|
| 125 |
cleaned_contents = []
|
| 126 |
summarized_contents = []
|
| 127 |
|
|
@@ -132,7 +144,13 @@ def load_web_content(urls):
|
|
| 132 |
print(f"Cleaned content for URL {i+1}:")
|
| 133 |
print(cleaned_content[:500] + "..." if len(cleaned_content) > 500 else cleaned_content)
|
| 134 |
print("\n" + "-"*50 + "\n")
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
summarized_content = summarize_content(cleaned_content)
|
| 137 |
summarized_contents.append(summarized_content)
|
| 138 |
print(f"Summarized content for URL {i+1}:")
|
|
@@ -140,6 +158,9 @@ def load_web_content(urls):
|
|
| 140 |
print("\n" + "-"*50 + "\n")
|
| 141 |
except Exception as e:
|
| 142 |
print(f"Error processing content for URL {i+1}: {str(e)}")
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
if not summarized_contents:
|
| 145 |
print("Error: No content could be processed")
|
|
@@ -160,19 +181,19 @@ def generate_detailed_explanation(query, context):
|
|
| 160 |
Explanation:"""
|
| 161 |
|
| 162 |
try:
|
| 163 |
-
|
|
|
|
| 164 |
messages=[
|
| 165 |
{"role": "system", "content": "You are a knowledgeable assistant that provides good and easy to understand explanations on various topics, incorporating all relevant information from the given context."},
|
| 166 |
{"role": "user", "content": prompt}
|
| 167 |
],
|
| 168 |
-
|
| 169 |
-
max_tokens=7000, # Reduced to stay within the 8000 token limit
|
| 170 |
)
|
| 171 |
|
| 172 |
-
explanation =
|
| 173 |
if not explanation.strip():
|
| 174 |
-
print("Error: Empty explanation received from
|
| 175 |
-
raise ValueError("Empty explanation received from
|
| 176 |
return explanation
|
| 177 |
except Exception as e:
|
| 178 |
print(f"Error in generate_detailed_explanation: {str(e)}")
|
|
@@ -193,4 +214,3 @@ def main():
|
|
| 193 |
|
| 194 |
if __name__ == "__main__":
|
| 195 |
main()
|
| 196 |
-
|
|
|
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import requests
|
| 4 |
from langchain_community.document_loaders import WebBaseLoader
|
| 5 |
+
from openai import OpenAI
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
import re
|
| 8 |
import time
|
|
|
|
| 15 |
# Initialize API clients
|
| 16 |
BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")
|
| 17 |
BRAVE_SEARCH_URL = "https://api.search.brave.com/res/v1/news/search"
|
| 18 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 19 |
|
| 20 |
+
openai_client = OpenAI(api_key=OPENAI_API_KEY)
|
| 21 |
|
| 22 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
| 23 |
def clean_content(content):
|
|
|
|
| 25 |
soup = BeautifulSoup(content, 'html.parser')
|
| 26 |
|
| 27 |
# Remove unwanted elements
|
| 28 |
+
for element in soup(['header', 'footer', 'nav', 'aside', 'menu']):
|
| 29 |
element.decompose()
|
| 30 |
|
| 31 |
+
# Try to find the main content
|
| 32 |
+
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
| 33 |
+
|
| 34 |
+
if main_content:
|
| 35 |
+
# If a main content area is found, use that
|
| 36 |
+
text = main_content.get_text()
|
| 37 |
+
else:
|
| 38 |
+
# If no main content area is found, use the body
|
| 39 |
+
body = soup.find('body')
|
| 40 |
+
if body:
|
| 41 |
+
text = body.get_text()
|
| 42 |
+
else:
|
| 43 |
+
text = soup.get_text()
|
| 44 |
|
| 45 |
# Remove extra spaces and newlines
|
| 46 |
text = re.sub(r'\s+', ' ', text).strip()
|
|
|
|
| 50 |
|
| 51 |
return text
|
| 52 |
|
| 53 |
+
|
| 54 |
def summarize_content(content, max_tokens=4000):
|
| 55 |
summarization_prompt = f"""Summarize the following content, preserving important details, facts, and figures. This summary will be used for research and news purposes, so accuracy and comprehensiveness are crucial. Keep the summary within approximately {max_tokens} tokens.
|
| 56 |
|
|
|
|
| 60 |
Summary:"""
|
| 61 |
|
| 62 |
try:
|
| 63 |
+
response = openai_client.chat.completions.create(
|
| 64 |
+
model="gpt-4o-mini",
|
| 65 |
messages=[
|
| 66 |
{"role": "system", "content": "You are an expert summarizer, capable of condensing information while retaining crucial details."},
|
| 67 |
{"role": "user", "content": summarization_prompt}
|
| 68 |
],
|
| 69 |
+
max_tokens=max_tokens
|
|
|
|
| 70 |
)
|
| 71 |
|
| 72 |
+
summary = response.choices[0].message.content
|
| 73 |
if not summary.strip():
|
| 74 |
+
raise ValueError("Empty summary received from OpenAI")
|
| 75 |
return summary
|
| 76 |
except Exception as e:
|
| 77 |
+
raise ValueError(f"Error in OpenAI API call: {str(e)}")
|
| 78 |
|
| 79 |
def perform_web_search(query, num_results=2):
|
| 80 |
headers = {
|
|
|
|
| 133 |
def load_web_content(urls):
|
| 134 |
loader = WebBaseLoader(urls)
|
| 135 |
documents = loader.load()
|
| 136 |
+
print('Documents: ', documents)
|
| 137 |
cleaned_contents = []
|
| 138 |
summarized_contents = []
|
| 139 |
|
|
|
|
| 144 |
print(f"Cleaned content for URL {i+1}:")
|
| 145 |
print(cleaned_content[:500] + "..." if len(cleaned_content) > 500 else cleaned_content)
|
| 146 |
print("\n" + "-"*50 + "\n")
|
| 147 |
+
|
| 148 |
+
print('Cleaned content: ', cleaned_content)
|
| 149 |
+
print('-'*50)
|
| 150 |
+
print(len(cleaned_content))
|
| 151 |
+
cleaned_content = cleaned_content.replace('\n', ' ')
|
| 152 |
+
cleaned_content = cleaned_content.replace('\t', ' ')
|
| 153 |
+
cleaned_content = cleaned_content[:1000]
|
| 154 |
summarized_content = summarize_content(cleaned_content)
|
| 155 |
summarized_contents.append(summarized_content)
|
| 156 |
print(f"Summarized content for URL {i+1}:")
|
|
|
|
| 158 |
print("\n" + "-"*50 + "\n")
|
| 159 |
except Exception as e:
|
| 160 |
print(f"Error processing content for URL {i+1}: {str(e)}")
|
| 161 |
+
print(f"Full error details: {repr(e)}")
|
| 162 |
+
print(f"URL: {urls[i]}")
|
| 163 |
+
print("Skipping this URL and continuing with the next one.")
|
| 164 |
|
| 165 |
if not summarized_contents:
|
| 166 |
print("Error: No content could be processed")
|
|
|
|
| 181 |
Explanation:"""
|
| 182 |
|
| 183 |
try:
|
| 184 |
+
response = openai_client.chat.completions.create(
|
| 185 |
+
model="gpt-4o-mini",
|
| 186 |
messages=[
|
| 187 |
{"role": "system", "content": "You are a knowledgeable assistant that provides good and easy to understand explanations on various topics, incorporating all relevant information from the given context."},
|
| 188 |
{"role": "user", "content": prompt}
|
| 189 |
],
|
| 190 |
+
max_tokens=4096 # Adjust as needed
|
|
|
|
| 191 |
)
|
| 192 |
|
| 193 |
+
explanation = response.choices[0].message.content
|
| 194 |
if not explanation.strip():
|
| 195 |
+
print("Error: Empty explanation received from OpenAI")
|
| 196 |
+
raise ValueError("Empty explanation received from OpenAI")
|
| 197 |
return explanation
|
| 198 |
except Exception as e:
|
| 199 |
print(f"Error in generate_detailed_explanation: {str(e)}")
|
|
|
|
| 214 |
|
| 215 |
if __name__ == "__main__":
|
| 216 |
main()
|
|
|