Sentinel-AI-Web-Search-Test-v2-Testing-Score

Build error

App Files Files Community

Shreyas094 committed on Jun 21, 2024

Commit

a2335c5

verified ·

1 Parent(s): 753d9d8

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -17

app.py CHANGED Viewed

@@ -1,30 +1,135 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-# Path to the locally saved quantized model directory
-model_path = '/path/to/your/quantized_model_directory'
-# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-# Load quantized model
-quantized_model = AutoModelForCausalLM.from_pretrained(model_path)
-# Check if a GPU is available and move model to GPU if available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-quantized_model.to(device)
-# Example text input
-text_input = "How did Tesla perform in Q1 2024?"
-# Tokenize input
-inputs = tokenizer(text_input, return_tensors="pt").to(device)
-# Generate response
-outputs = quantized_model.generate(**inputs, max_length=150, do_sample=False)
-# Decode generated tokens to readable string
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-# Print generated response
-print(f"Generated response: {response}")

+import random
+import requests
+from bs4 import BeautifulSoup
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import login
 import torch
+import os
+# Ensure sentencepiece is installed
+try:
+    import sentencepiece
+except ImportError:
+    raise ImportError("Please install the sentencepiece library using `pip install sentencepiece`.")
+# Retrieve the Hugging Face token from the environment (replace 'HUGGINGFACE_TOKEN' with the name of your secret, not the token itself)
+hf_token = os.getenv('HUGGINGFACE_TOKEN')
+# Log in to Hugging Face
+login(token=hf_token)
+# List of user agents
+_useragent_list = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
+]
+# Function to extract visible text from HTML content of a webpage
+def extract_text_from_webpage(html):
+    print("Extracting text from webpage...")
+    soup = BeautifulSoup(html, 'html.parser')
+    for script in soup(["script", "style"]):
+        script.extract()  # Remove scripts and styles
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = '\n'.join(chunk for chunk in chunks if chunk)
+    print(f"Extracted text length: {len(text)}")
+    return text
+# Function to perform a Google search and retrieve results
+def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
+    """Performs a Google search and returns the results."""
+    print(f"Searching for term: {term}")
+    escaped_term = requests.utils.quote(term)
+    start = 0
+    all_results = []
+    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
+    with requests.Session() as session:
+        while start < num_results:
+            print(f"Fetching search results starting from: {start}")
+            try:
+                # Choose a random user agent
+                user_agent = random.choice(_useragent_list)
+                headers = {
+                    'User-Agent': user_agent
+                }
+                print(f"Using User-Agent: {headers['User-Agent']}")
+                resp = session.get(
+                    url="https://www.google.com/search",
+                    headers=headers,
+                    params={
+                        "q": term,
+                        "num": num_results - start,
+                        "hl": lang,
+                        "start": start,
+                        "safe": safe,
+                    },
+                    timeout=timeout,
+                    verify=ssl_verify,
+                )
+                resp.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                print(f"Error fetching search results: {e}")
+                break
+            soup = BeautifulSoup(resp.text, "html.parser")
+            result_block = soup.find_all("div", attrs={"class": "g"})
+            if not result_block:
+                print("No more results found.")
+                break
+            for result in result_block:
+                link = result.find("a", href=True)
+                if link:
+                    link = link["href"]
+                    print(f"Found link: {link}")
+                    try:
+                        webpage = session.get(link, headers=headers, timeout=timeout)
+                        webpage.raise_for_status()
+                        visible_text = extract_text_from_webpage(webpage.text)
+                        if len(visible_text) > max_chars_per_page:
+                            visible_text = visible_text[:max_chars_per_page] + "..."
+                        all_results.append({"link": link, "text": visible_text})
+                    except requests.exceptions.RequestException as e:
+                        print(f"Error fetching or processing {link}: {e}")
+                        all_results.append({"link": link, "text": None})
+                else:
+                    print("No link found in result.")
+                    all_results.append({"link": None, "text": None})
+            start += len(result_block)
+    print(f"Total results fetched: {len(all_results)}")
+    return all_results
+# Load the Mistral-7B-Instruct-v0.3 model and tokenizer
+model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+# Check if a GPU is available and if not, fall back to CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check for GPU
+model.to(device)  # Move model to the device
+# Example usage
+search_term = "How did Tesla perform in Q1 2024"
+search_results = google_search(search_term, num_results=3)
+# Combine text from search results to create a prompt
+combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])
+# Tokenize the input text
+inputs = tokenizer(combined_text, return_tensors="pt").to(device)  # Move inputs to the device
+# Generate a response
+outputs = model.generate(**inputs, max_length=150, temperature=0.7, top_p=0.9, top_k=50)
+# Decode the generated tokens to a readable string
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+# Print the response
+print(response)