Spaces:

linktimecloud
/

ask.py

Sleeping

App Files Files Community

linktimecloud commited on Oct 22, 2024

Commit

a228dd5

verified ·

1 Parent(s): 9bfb099

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

.env.tpl +6 -0
.gitignore +164 -0
LICENSE +21 -0
README.md +194 -7
ask.py +618 -0
instructions/links.txt +3 -0
requirements.txt +9 -0

.env.tpl ADDED Viewed

	@@ -0,0 +1,6 @@

+# right now we use Google search API
+SEARCH_API_KEY=your-google-search-api-key
+SEARCH_PROJECT_KEY=your-google-cx-key
+# right now we use OpenAI API
+LLM_API_KEY=your-openai-api-key

.gitignore ADDED Viewed

	@@ -0,0 +1,164 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+.gradio

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 pengfeng
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,12 +1,199 @@
 ---
-title: Ask.py
-emoji: 🏃
-colorFrom: red
-colorTo: pink
 sdk: gradio
 sdk_version: 5.3.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: ask.py
+app_file: ask.py
 sdk: gradio
 sdk_version: 5.3.0
 ---
+# ask.py
+[![License](https://img.shields.io/github/license/pengfeng/ask.py)](LICENSE)
+A single Python program to implement the search-extract-summarize flow, similar to AI search
+engines such as Perplexity.
+> [!NOTE]
+> Our main goal is to illustrate the basic concepts of AI search engines with the raw constructs.
+> Performance or scalability is not in the scope of this program.
+## The search-extract-summarize flow
+Given a query, the program will
+- search Google for the top 10 web pages
+- crawl and scrape the pages for their text content
+- chunk the text content into chunks and save them into a vectordb
+- perform a vector search with the query and find the top 10 matched chunks
+- use the top 10 chunks as the context to ask an LLM to generate the answer
+- output the answer with the references
+Of course this flow is a very simplified version of the real AI search engines, but it is a good
+starting point to understand the basic concepts.
+One benefit is that we can manipulate the search function and output format.
+For example, we can:
+- search with date-restrict to only retrieve the latest information.
+- search within a target-site to only create the answer from the contents from it.
+- ask LLM to use a specific language to answer the question.
+- ask LLM to answer with a specific length.
+- crawl a specific list of urls and answer based on those contents only.
+## Quick start
+```bash
+pip install -r requirements.txt
+# modify .env file to set the API keys or export them as environment variables as below
+# right now we use Google search API
+export SEARCH_API_KEY="your-google-search-api-key"
+export SEARCH_PROJECT_KEY="your-google-cx-key"
+# right now we use OpenAI API
+export LLM_API_KEY="your-openai-api-key"
+# run the program
+python ask.py -q "What is an LLM agent?"
+# we can specify more parameters to control the behavior such as date_restrict and target_site
+python ask.py --help
+Usage: ask.py [OPTIONS]
+  Search web for the query and summarize the results
+Options:
+  -q, --query TEXT                Query to search  [required]
+  --url-list TEXT                 Instead of doing web search, scrape the
+                                  target URL list and answer the query based
+                                  on the content  [default:
+                                  instructions/links.txt]
+  -d, --date-restrict INTEGER     Restrict search results to a specific date
+                                  range, default is no restriction
+  -s, --target-site TEXT          Restrict search results to a specific site,
+                                  default is no restriction
+  --output-language TEXT          Output language for the answer
+  --output-length INTEGER         Output length for the answer
+  -m, --model-name TEXT           Model name to use for inference
+  -l, --log-level [DEBUG|INFO|WARNING|ERROR]
+                                  Set the logging level  [default: INFO]
+  --help                          Show this message and exit.
+```
+## Libraries and APIs used
+- [Google Search API](https://developers.google.com/custom-search/v1/overview)
+- [OpenAI API](https://beta.openai.com/docs/api-reference/completions/create)
+- [Jinja2](https://jinja.palletsprojects.com/en/3.0.x/)
+- [bs4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
+- [duckdb](https://github.com/duckdb/duckdb)
+## Sample output
+### General Search
+```
+% python ask.py -q "Why do we need agentic RAG even if we have ChatGPT?"
+✅ Found 10 links for query: Why do we need agentic RAG even if we have ChatGPT?
+✅ Scraping the URLs ...
+✅ Scraped 10 URLs ...
+✅ Chunking the text ...
+✅ Saving to vector DB ...
+✅ Querying the vector DB ...
+✅ Running inference with context ...
+# Answer
+Agentic RAG (Retrieval-Augmented Generation) is needed alongside ChatGPT for several reasons:
+1. **Precision and Contextual Relevance**: While ChatGPT offers generative responses, it may not
+reliably provide precise answers, especially when specific, accurate information is critical[5].
+Agentic RAG enhances this by integrating retrieval mechanisms that improve response context and
+accuracy, allowing users to access the most relevant and recent data without the need for costly
+model fine-tuning[2].
+2. **Customizability**: RAG allows businesses to create tailored chatbots that can securely
+reference company-specific data[2]. In contrast, ChatGPT’s broader capabilities may not be
+directly suited for specialized, domain-specific questions without comprehensive customization[3].
+3. **Complex Query Handling**: RAG can be optimized for complex queries and can be adjusted to
+work better with specific types of inputs, such as comparing and contrasting information, a task
+where ChatGPT may struggle under certain circumstances[9]. This level of customization can lead to
+better performance in niche applications where precise retrieval of information is crucial.
+4. **Asynchronous Processing Capabilities**: Future agentic systems aim to integrate asynchronous
+handling of actions, allowing for parallel processing and reducing wait times for retrieval and
+computation, which is a limitation in the current form of ChatGPT[7]. This advancement would enhance
+overall efficiency and responsiveness in conversations.
+5. **Incorporating Retrieved Information Effectively**: Using RAG can significantly improve how
+retrieved information is utilized within a conversation. By effectively managing the context and
+relevance of retrieved documents, RAG helps in framing prompts that can guide ChatGPT towards
+delivering more accurate responses[10].
+In summary, while ChatGPT excels in generating conversational responses, agentic RAG brings
+precision, customization, and efficiency that can significantly enhance the overall conversational
+AI experience.
+# References
+[1] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
+[2] https://www.linkedin.com/posts/brianjuliusdc_dax-powerbi-chatgpt-activity-7235953280177041408-wQqq
+[3] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
+[4] https://community.openai.com/t/prompt-engineering-for-rag/621495
+[5] https://www.ben-evans.com/benedictevans/2024/6/8/building-ai-products
+[6] https://community.openai.com/t/prompt-engineering-for-rag/621495
+[7] https://www.linkedin.com/posts/kurtcagle_agentic-rag-personalizing-and-optimizing-activity-7198097129993613312-z7Sm
+[8] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
+[9] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
+[10] https://community.openai.com/t/prompt-engineering-for-rag/621495
+```
+### Only use the latest information from a specific site
+The following query uses only information from openai.com that was updated within the previous
+day. The behavior is similar to the "site:openai.com" and "date-restrict" search parameters in Google
+search.
+```
+% python ask.py -q "OpenAI Swarm Framework" -d 1 -s openai.com
+✅ Found 10 links for query: OpenAI Swarm Framework
+✅ Scraping the URLs ...
+✅ Scraped 10 URLs ...
+✅ Chunking the text ...
+✅ Saving to vector DB ...
+✅ Querying the vector DB to get context ...
+✅ Running inference with context ...
+# Answer
+OpenAI Swarm Framework is an experimental platform designed for building, orchestrating, and
+deploying multi-agent systems, enabling multiple AI agents to collaborate on complex tasks. It contrasts
+with traditional single-agent models by facilitating agent interaction and coordination, thus enhancing
+efficiency[5][9]. The framework provides developers with a way to orchestrate these agent systems in
+a lightweight manner, leveraging Node.js for scalable applications[1][4].
+One implementation of this framework is Swarm.js, which serves as a Node.js SDK, allowing users to
+create and manage agents that perform tasks and hand off conversations. Swarm.js is positioned as
+an educational tool, making it accessible for both beginners and experts, although it may still contain
+bugs and is currently lightweight[1][3][7]. This new approach emphasizes multi-agent collaboration and is
+well-suited for back-end development, requiring some programming expertise for effective implementation[9].
+Overall, OpenAI Swarm facilitates a shift in how AI systems can collaborate, differing from existing
+OpenAI tools by focusing on backend orchestration rather than user-interactive front-end applications[9].
+# References
+[1] https://community.openai.com/t/introducing-swarm-js-node-js-implementation-of-openai-swarm/977510
+[2] https://community.openai.com/t/introducing-swarm-js-a-node-js-implementation-of-openai-swarm/977510
+[3] https://community.openai.com/t/introducing-swarm-js-node-js-implementation-of-openai-swarm/977510
+[4] https://community.openai.com/t/introducing-swarm-js-a-node-js-implementation-of-openai-swarm/977510
+[5] https://community.openai.com/t/swarm-some-initial-insights/976602
+[6] https://community.openai.com/t/swarm-some-initial-insights/976602
+[7] https://community.openai.com/t/introducing-swarm-js-node-js-implementation-of-openai-swarm/977510
+[8] https://community.openai.com/t/introducing-swarm-js-a-node-js-implementation-of-openai-swarm/977510
+[9] https://community.openai.com/t/swarm-some-initial-insights/976602
+[10] https://community.openai.com/t/swarm-some-initial-insights/976602
+```

ask.py ADDED Viewed

	@@ -0,0 +1,618 @@

+import json
+import logging
+import os
+import urllib.parse
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple
+import click
+import duckdb
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from jinja2 import BaseLoader, Environment
+from openai import OpenAI
+# Absolute directory of this script; the default .env file is looked up next to it.
+script_dir = os.path.dirname(os.path.abspath(__file__))
+default_env_file = os.path.abspath(os.path.join(script_dir, ".env"))
+def get_logger(log_level: str) -> logging.Logger:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(log_level)
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    return logger
+class Ask:
+    def __init__(self, logger: Optional[logging.Logger] = None):
+        self.read_env_variables()
+        if logger is not None:
+            self.logger = logger
+        else:
+            self.logger = get_logger("INFO")
+        self.table_name = "document_chunks"
+        self.db_con = duckdb.connect(":memory:")
+        self.db_con.install_extension("vss")
+        self.db_con.load_extension("vss")
+        self.db_con.install_extension("fts")
+        self.db_con.load_extension("fts")
+        self.db_con.sql("CREATE SEQUENCE seq_docid START 1000")
+        self.db_con.execute(
+            f"""
+CREATE TABLE {self.table_name} (
+    doc_id INTEGER PRIMARY KEY DEFAULT nextval('seq_docid'),
+    url TEXT,
+    chunk TEXT,
+    vec FLOAT[{self.embedding_dimensions}]
+);
+"""
+        )
+        self.session = requests.Session()
+        user_agent: str = (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+        )
+        self.session.headers.update({"User-Agent": user_agent})
+    def read_env_variables(self) -> None:
+        err_msg = ""
+        self.search_api_key = os.environ.get("SEARCH_API_KEY")
+        if self.search_api_key is None:
+            err_msg += "SEARCH_API_KEY env variable not set.\n"
+        self.search_project_id = os.environ.get("SEARCH_PROJECT_KEY")
+        if self.search_project_id is None:
+            err_msg += "SEARCH_PROJECT_KEY env variable not set.\n"
+        self.llm_api_key = os.environ.get("LLM_API_KEY")
+        if self.llm_api_key is None:
+            err_msg += "LLM_API_KEY env variable not set.\n"
+        if err_msg != "":
+            raise Exception(f"\n{err_msg}\n")
+        self.llm_base_url = os.environ.get("LLM_BASE_URL")
+        if self.llm_base_url is None:
+            self.llm_base_url = "https://api.openai.com/v1"
+        self.embedding_model = os.environ.get("EMBEDDING_MODEL")
+        self.embedding_dimensions = os.environ.get("EMBEDDING_DIMENSIONS")
+        if self.embedding_model is None or self.embedding_dimensions is None:
+            self.embedding_model = "text-embedding-3-small"
+            self.embedding_dimensions = 1536
+    def search_web(self, query: str, date_restrict: int, target_site: str) -> List[str]:
+        escaped_query = urllib.parse.quote(query)
+        url_base = (
+            f"https://www.googleapis.com/customsearch/v1?key={self.search_api_key}"
+            f"&cx={self.search_project_id}&q={escaped_query}"
+        )
+        url_paras = f"&safe=active"
+        if date_restrict is not None and date_restrict > 0:
+            url_paras += f"&dateRestrict={date_restrict}"
+        if target_site is not None and target_site != "":
+            url_paras += f"&siteSearch={target_site}&siteSearchFilter=i"
+        url = f"{url_base}{url_paras}"
+        self.logger.debug(f"Searching for query: {query}")
+        resp = requests.get(url)
+        if resp is None:
+            raise Exception("No response from search API")
+        search_results_dict = json.loads(resp.text)
+        if "error" in search_results_dict:
+            raise Exception(
+                f"Error in search API response: {search_results_dict['error']}"
+            )
+        if "searchInformation" not in search_results_dict:
+            raise Exception(
+                f"No search information in search API response: {resp.text}"
+            )
+        total_results = search_results_dict["searchInformation"].get("totalResults", 0)
+        if total_results == 0:
+            self.logger.warning(f"No results found for query: {query}")
+            return []
+        results = search_results_dict.get("items", [])
+        if results is None or len(results) == 0:
+            self.logger.warning(f"No result items in the response for query: {query}")
+            return []
+        found_links = []
+        for result in results:
+            link = result.get("link", None)
+            if link is None or link == "":
+                self.logger.warning(f"Search result link missing: {result}")
+                continue
+            found_links.append(link)
+        return found_links
+    def _scape_url(self, url: str) -> Tuple[str, str]:
+        try:
+            response = self.session.get(url, timeout=10)
+            soup = BeautifulSoup(response.content, "lxml", from_encoding="utf-8")
+            body_tag = soup.body
+            if body_tag:
+                body_text = body_tag.get_text()
+                body_text = " ".join(body_text.split()).strip()
+                self.logger.debug(f"Scraped {url}: {body_text}...")
+                if len(body_text) > 100:
+                    return url, body_text
+                else:
+                    self.logger.warning(
+                        f"Body text too short for url: {url}, length: {len(body_text)}"
+                    )
+                    return url, ""
+            else:
+                self.logger.warning(f"No body tag found in the response for url: {url}")
+                return url, ""
+        except Exception as e:
+            self.logger.error(f"Scraping error {url}: {e}")
+            return url, ""
+    def scrape_urls(self, urls: List[str]) -> Dict[str, str]:
+        # the key is the url and the value is the body text
+        scrape_results: Dict[str, str] = {}
+        partial_scrape = partial(self._scape_url)
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            results = executor.map(partial_scrape, urls)
+        for url, body_text in results:
+            if body_text != "":
+                scrape_results[url] = body_text
+        return scrape_results
+    def chunk_results(
+        self, scrape_results: Dict[str, str], size: int, overlap: int
+    ) -> Dict[str, List[str]]:
+        chunking_results: Dict[str, List[str]] = {}
+        for url, text in scrape_results.items():
+            chunks = []
+            for pos in range(0, len(text), size - overlap):
+                chunks.append(text[pos : pos + size])
+            chunking_results[url] = chunks
+        return chunking_results
+    def get_embedding(self, client: OpenAI, texts: List[str]) -> List[List[float]]:
+        if len(texts) == 0:
+            return []
+        response = client.embeddings.create(input=texts, model=self.embedding_model)
+        embeddings = []
+        for i in range(len(response.data)):
+            embeddings.append(response.data[i].embedding)
+        return embeddings
+    def batch_get_embedding(
+        self, client: OpenAI, chunk_batch: Tuple[str, List[str]]
+    ) -> Tuple[Tuple[str, List[str]], List[List[float]]]:
+        """
+        Return the chunk_batch as well as the embeddings for each chunk so that
+        we can aggregate them and save them to the database together.
+        Args:
+        - client: OpenAI client
+        - chunk_batch: Tuple of URL and list of chunks scraped from the URL
+        Returns:
+        - Tuple of chunk_bach and list of result embeddings
+        """
+        texts = chunk_batch[1]
+        embeddings = self.get_embedding(client, texts)
+        return chunk_batch, embeddings
+    def save_to_db(self, chunking_results: Dict[str, List[str]]) -> None:
+        client = self._get_api_client()
+        embed_batch_size = 50
+        query_batch_size = 100
+        insert_data = []
+        batches: List[Tuple[str, List[str]]] = []
+        for url, list_chunks in chunking_results.items():
+            for i in range(0, len(list_chunks), embed_batch_size):
+                list_chunks = list_chunks[i : i + embed_batch_size]
+                batches.append((url, list_chunks))
+        self.logger.info(f"Embedding {len(batches)} batches of chunks ...")
+        partial_get_embedding = partial(self.batch_get_embedding, client)
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            all_embeddings = executor.map(partial_get_embedding, batches)
+        self.logger.info(f"✅ Finished embedding.")
+        for chunk_batch, embeddings in all_embeddings:
+            url = chunk_batch[0]
+            list_chunks = chunk_batch[1]
+            insert_data.extend(
+                [
+                    (url.replace("'", " "), chunk.replace("'", " "), embedding)
+                    for chunk, embedding in zip(list_chunks, embeddings)
+                ]
+            )
+        for i in range(0, len(insert_data), query_batch_size):
+            # insert the batch into DuckDB
+            value_str = ", ".join(
+                [
+                    f"('{url}', '{chunk}', {embedding})"
+                    for url, chunk, embedding in insert_data[i : i + embed_batch_size]
+                ]
+            )
+            query = f"""
+            INSERT INTO {self.table_name} (url, chunk, vec) VALUES {value_str};
+            """
+            self.db_con.execute(query)
+        self.db_con.execute(
+            f"""
+                CREATE INDEX cos_idx ON {self.table_name} USING HNSW (vec)
+                WITH (metric = 'cosine');
+            """
+        )
+        self.logger.info(f"✅ Created the vector index ...")
+        self.db_con.execute(
+            f"""
+                PRAGMA create_fts_index(
+                {self.table_name}, 'doc_id', 'chunk'
+                );
+            """
+        )
+        self.logger.info(f"✅ Created the full text search index ...")
+    def vector_search(self, query: str) -> List[Dict[str, Any]]:
+        client = self._get_api_client()
+        embeddings = self.get_embedding(client, [query])[0]
+        query_result: duckdb.DuckDBPyRelation = self.db_con.sql(
+            f"""
+            SELECT * FROM {self.table_name}
+            ORDER BY array_distance(vec, {embeddings}::FLOAT[{self.embedding_dimensions}])
+            LIMIT 10;
+        """
+        )
+        self.logger.debug(query_result)
+        matched_chunks = []
+        for record in query_result.fetchall():
+            result_record = {
+                "url": record[1],
+                "chunk": record[2],
+            }
+            matched_chunks.append(result_record)
+        return matched_chunks
+    def _get_api_client(self) -> OpenAI:
+        return OpenAI(api_key=self.llm_api_key, base_url=self.llm_base_url)
+    def _render_template(self, template_str: str, variables: Dict[str, Any]) -> str:
+        env = Environment(loader=BaseLoader(), autoescape=False)
+        template = env.from_string(template_str)
+        return template.render(variables)
+    def run_inference(
+        self,
+        query: str,
+        model_name: str,
+        matched_chunks: List[Dict[str, Any]],
+        output_language: str,
+        output_length: int,
+    ) -> str:
+        system_prompt = (
+            "You are an expert summarizing the answers based on the provided contents."
+        )
+        user_promt_template = """
+Given the context as a sequence of references with a reference id in the
+format of a leading [x], please answer the following question using {{ language }}:
+{{ query }}
+In the answer, use format [1], [2], ..., [n] in line where the reference is used.
+For example, "According to the research from Google[3], ...".
+Please create the answer strictly related to the context. If the context has no
+information about the query, please write "No related information found in the context."
+using {{ language }}.
+{{ length_instructions }}
+Here is the context:
+{{ context }}
+"""
+        context = ""
+        for i, chunk in enumerate(matched_chunks):
+            context += f"[{i+1}] {chunk['chunk']}\n"
+        if output_length is None or output_length == 0:
+            length_instructions = ""
+        else:
+            length_instructions = (
+                f"Please provide the answer in { output_length } words."
+            )
+        user_prompt = self._render_template(
+            user_promt_template,
+            {
+                "query": query,
+                "context": context,
+                "language": output_language,
+                "length_instructions": length_instructions,
+            },
+        )
+        self.logger.debug(f"Running inference with model: {model_name}")
+        self.logger.debug(f"Final user prompt: {user_prompt}")
+        api_client = self._get_api_client()
+        completion = api_client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                },
+            ],
+        )
+        if completion is None:
+            raise Exception("No completion from the API")
+        response_str = completion.choices[0].message.content
+        return response_str
+def _read_url_list(url_list_file: str) -> str:
+    if url_list_file is None:
+        return None
+    with open(url_list_file, "r") as f:
+        links = f.readlines()
+    links = [
+        link.strip()
+        for link in links
+        if link.strip() != "" and not link.startswith("#")
+    ]
+    return "\n".join(links)
+def _run_query(
+    query: str,
+    date_restrict: int,
+    target_site: str,
+    output_language: str,
+    output_length: int,
+    url_list_str: str,
+    model_name: str,
+    log_level: str,
+) -> str:
+    logger = get_logger(log_level)
+    load_dotenv(dotenv_path=default_env_file, override=False)
+    ask = Ask(logger=logger)
+    if url_list_str is None or url_list_str.strip() == "":
+        logger.info("Searching the web ...")
+        links = ask.search_web(query, date_restrict, target_site)
+        logger.info(f"✅ Found {len(links)} links for query: {query}")
+        for i, link in enumerate(links):
+            logger.debug(f"{i+1}. {link}")
+    else:
+        links = url_list_str.split("\n")
+    logger.info("Scraping the URLs ...")
+    scrape_results = ask.scrape_urls(links)
+    logger.info(f"✅ Scraped {len(scrape_results)} URLs.")
+    logger.info("Chunking the text ...")
+    chunking_results = ask.chunk_results(scrape_results, 1000, 100)
+    total_chunks = 0
+    for url, chunks in chunking_results.items():
+        logger.debug(f"URL: {url}")
+        total_chunks += len(chunks)
+        for i, chunk in enumerate(chunks):
+            logger.debug(f"Chunk {i+1}: {chunk}")
+    logger.info(f"✅ Generated {total_chunks} chunks ...")
+    logger.info(f"Saving {total_chunks} chunks to DB ...")
+    ask.save_to_db(chunking_results)
+    logger.info(f"✅ Successfully embedded and saved chunks to DB.")
+    logger.info("Querying the vector DB to get context ...")
+    matched_chunks = ask.vector_search(query)
+    for i, result in enumerate(matched_chunks):
+        logger.debug(f"{i+1}. {result}")
+    logger.info(f"✅ Got {len(matched_chunks)} matched chunks.")
+    logger.info("Running inference with context ...")
+    answer = ask.run_inference(
+        query=query,
+        model_name=model_name,
+        matched_chunks=matched_chunks,
+        output_language=output_language,
+        output_length=output_length,
+    )
+    logger.info("✅ Finished inference API call.")
+    logger.info("generateing output ...")
+    answer = f"# Answer\n\n{answer}\n"
+    references = "\n".join(
+        [f"[{i+1}] {result['url']}" for i, result in enumerate(matched_chunks)]
+    )
+    return f"{answer}\n\n# References\n\n{references}"
+def launch_gradio(
+    query: str,
+    date_restrict: int,
+    target_site: str,
+    output_language: str,
+    output_length: int,
+    url_list_str: str,
+    model_name: str,
+    log_level: str,
+) -> None:
+    iface = gr.Interface(
+        fn=_run_query,
+        inputs=[
+            gr.Textbox(label="Query", value=query),
+            gr.Number(
+                label="Date Restrict (Optional) [0 or empty means no date limit.]",
+                value=date_restrict,
+            ),
+            gr.Textbox(
+                label="Target Sites (Optional) [Empty means seach the whole web.]",
+                value=target_site,
+            ),
+            gr.Textbox(
+                label="Output Language (Optional) [Default is English.]",
+                value=output_language,
+            ),
+            gr.Number(
+                label="Output Length in words (Optional) [Default is automatically decided by LLM.]",
+                value=output_length,
+            ),
+            gr.Textbox(
+                label="URL List (Optional) [When specified, scrape the urls instead of searching the web.]",
+                lines=5,
+                max_lines=20,
+                value=url_list_str,
+            ),
+        ],
+        additional_inputs=[
+            gr.Textbox(label="Model Name", value=model_name),
+            gr.Textbox(label="Log Level", value=log_level),
+        ],
+        outputs="text",
+        show_progress=True,
+        flagging_options=[("Report Error", None)],
+        title="Ask.py - Web Search-Extract-Summarize",
+        description="Search the web with the query and summarize the results. Source code: https://github.com/pengfeng/ask.py",
+    )
+    iface.launch()
+@click.command(help="Search web for the query and summarize the results")
+@click.option(
+    "--web-ui",
+    is_flag=True,
+    help="Launch the web interface",
+)
+@click.option("--query", "-q", required=False, help="Query to search")
+@click.option(
+    "--date-restrict",
+    "-d",
+    type=int,
+    required=False,
+    default=None,
+    help="Restrict search results to a specific date range, default is no restriction",
+)
+@click.option(
+    "--target-site",
+    "-s",
+    required=False,
+    default=None,
+    help="Restrict search results to a specific site, default is no restriction",
+)
+@click.option(
+    "--output-language",
+    required=False,
+    default="English",
+    help="Output language for the answer",
+)
+@click.option(
+    "--output-length",
+    type=int,
+    required=False,
+    default=None,
+    help="Output length for the answer",
+)
+@click.option(
+    "--url-list-file",
+    type=str,
+    required=False,
+    default=None,
+    show_default=True,
+    help="Instead of doing web search, scrape the target URL list and answer the query based on the content",
+)
+@click.option(
+    "--model-name",
+    "-m",
+    required=False,
+    default="gpt-4o-mini",
+    help="Model name to use for inference",
+)
+@click.option(
+    "-l",
+    "--log-level",
+    "log_level",
+    default="INFO",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
+    help="Set the logging level",
+    show_default=True,
+)
+def search_extract_summarize(
+    web_ui: bool,
+    query: str,
+    date_restrict: int,
+    target_site: str,
+    output_language: str,
+    output_length: int,
+    url_list_file: str,
+    model_name: str,
+    log_level: str,
+):
+    if web_ui:
+        launch_gradio(
+            query=query,
+            date_restrict=date_restrict,
+            target_site=target_site,
+            output_language=output_language,
+            output_length=output_length,
+            url_list_str=_read_url_list(url_list_file),
+            model_name=model_name,
+            log_level=log_level,
+        )
+    else:
+        if query is None:
+            raise Exception("Query is required for the command line mode")
+        result = _run_query(
+            query=query,
+            date_restrict=date_restrict,
+            target_site=target_site,
+            output_language=output_language,
+            output_length=output_length,
+            url_list_str=_read_url_list(url_list_file),
+            model_name=model_name,
+            log_level=log_level,
+        )
+        click.echo(result)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    search_extract_summarize()  # called without arguments: click parses the command line

instructions/links.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+# we will crawl these pages and answer the question based on their contents
+https://en.wikipedia.org/wiki/Large_language_model
+https://en.wikipedia.org/wiki/Retrieval-augmented_generation

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+click==8.1.7
+requests==2.31.0
+openai==1.40.2
+jinja2==3.1.3
+bs4==0.0.2
+lxml==4.8.0
+python-dotenv==1.0.1
+duckdb==1.1.2
+gradio==5.3.0