Spaces:

lagerbaer
/

secsplorer

Runtime error

App Files Files Community

lagerbaer committed on Jan 9, 2024

Commit

99e964c

1 Parent(s): d409029

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

modal_gradio_test.py +29 -9
modal_script.py +131 -133
simple_script.py +4 -95
update_vector_db.py +97 -0

modal_gradio_test.py CHANGED Viewed

@@ -3,8 +3,12 @@ from modal import Stub, Image, asgi_app
 from fastapi import FastAPI
-image = Image.debian_slim("3.11").pip_install(
-    "gradio",
 )
 stub = Stub("secsplorer", image=image)
@@ -12,15 +16,31 @@ stub = Stub("secsplorer", image=image)
 web_app = FastAPI()
-@stub.function()
 @asgi_app()
 def fastapi_app():
     import gradio as gr
     from gradio.routes import mount_gradio_app
-    def chat_function(message, history):
-        yield "Foo!"
-    interface = gr.ChatInterface(chat_function)
-    return mount_gradio_app(app=web_app, blocks=interface, path="/")

 from fastapi import FastAPI
+image = (
+    Image.debian_slim()
+    .run_commands(["pip install --upgrade pip"])
+    .pip_install(
+        "gradio==3.50.2",
+    )
 )
 stub = Stub("secsplorer", image=image)
 web_app = FastAPI()
+@stub.function(concurrency_limit=1)
 @asgi_app()
 def fastapi_app():
+    """
+    Builds a minimal Gradio chat demo and mounts it on the module-level FastAPI app.
+    The demo answers every message with a randomly chosen canned reply after a
+    two-second pause.
+    Returns:
+    The result of mount_gradio_app(app=web_app, blocks=demo, path="/").
+    """
     import gradio as gr
     from gradio.routes import mount_gradio_app
+    # NOTE(review): gradio is already imported two lines above; this import is redundant.
+    import gradio as gr
+    import random
+    import time
+    with gr.Blocks() as demo:
+        chatbot = gr.Chatbot()
+        msg = gr.Textbox()
+        # `clear` is not referenced again in this function.
+        clear = gr.ClearButton([msg, chatbot])
+        def respond(message, chat_history):
+            # Appends a (user message, canned reply) pair to the history and
+            # returns "" for the textbox plus the updated history.
+            print("Calling respond...")
+            bot_message = random.choice(
+                ["How are you?", "I love you", "I'm very hungry"]
+            )
+            chat_history.append((message, bot_message))
+            # Fixed two-second delay before returning.
+            time.sleep(2)
+            print("Returning result...")
+            return "", chat_history
+        # Inputs and outputs are both [msg, chatbot], matching respond's
+        # two arguments and two return values.
+        msg.submit(respond, [msg, chatbot], [msg, chatbot])
+    return mount_gradio_app(app=web_app, blocks=demo, path="/")

modal_script.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import List, Dict
 image = Image.debian_slim("3.11").pip_install(
     "cohere",
-    "gradio",
     "pinecone-client",
 )
@@ -28,144 +28,142 @@ def fastapi_app():
     import gradio as gr
     from gradio.routes import mount_gradio_app
-    # print("Connecting to cohere client")
-    # co = cohere.Client(os.environ["COHERE_API_KEY"])
-    # print("Done")
-    # # pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="gcp-starter")
-    # # index = pinecone.Index(index_name="td-sec-embeddings")
-    # index = None
-    # def retrieve(
-    #     index: pinecone.Index, query: str, co: cohere.Client
-    # ) -> List[Dict[str, str]]:
-    #     """
-    #     Retrieves documents based on the given query.
-    #     Parameters:
-    #     query (str): The query to retrieve documents for.
-    #     Returns:
-    #     List[Dict[str, str]]: A list of dictionaries representing the retrieved  documents, with 'title', 'snippet', and 'url' keys.
-    #     """
-    #     docs_retrieved = []
-    #     print(f"Calling retrieve for '{query}'")
-    #     print("Embedding the query")
-    #     query_emb = co.embed(
-    #         texts=[query], model="embed-english-v3.0", input_type="search_query"
-    #     ).embeddings
-    #     print("Querying pinecone")
-    #     res = index.query(query_emb, top_k=10, include_metadata=True)
-    #     print("Preparing to rerank")
-    #     docs_to_rerank = [match["metadata"] for match in res["matches"]]
-    #     rerank_results = co.rerank(
-    #         query=query,
-    #         documents=docs_to_rerank,
-    #         top_n=3,
-    #         model="rerank-english-v2.0",
-    #     )
-    #     docs_retrieved = []
-    #     for hit in rerank_results:
-    #         docs_retrieved.append(docs_to_rerank[hit.index])
-    #     print("Returning retrieved docs")
-    #     return docs_retrieved
-    # class Chatbot:
-    #     def __init__(self, co: cohere.Client, index: pinecone.Index):
-    #         self.index = index
-    #         self.conversation_id = str(uuid.uuid4())
-    #         self.co = co
-    #     def generate_response(self, message: str):
-    #         """
-    #         Generates a response to the user's message.
-    #         Parameters:
-    #         message (str): The user's message.
-    #         Yields:
-    #         Event: A response event generated by the chatbot.
-    #         Returns:
-    #         List[Dict[str, str]]: A list of dictionaries representing the retrieved documents.
-    #         """
-    #         # Generate search queries (if any)
-    #         response = self.co.chat(message=message, search_queries_only=True)
-    #         # If there are search queries, retrieve documents and respond
-    #         if response.search_queries:
-    #             print("Retrieving information")
-    #             documents = self.retrieve_docs(response)
-    #             response = self.co.chat(
-    #                 message=message,
-    #                 documents=documents,
-    #                 conversation_id=self.conversation_id,
-    #                 stream=True,
-    #             )
-    #             for event in response:
-    #                 yield event
-    #         # If there is no search query, directly respond
-    #         else:
-    #             response = self.co.chat(
-    #                 message=message, conversation_id=self.conversation_id, stream=True
-    #             )
-    #             for event in response:
-    #                 yield event
-    #     def retrieve_docs(self, response) -> List[Dict[str, str]]:
-    #         """
-    #         Retrieves documents based on the search queries in the response.
-    #         Parameters:
-    #         response: The response object containing search queries.
-    #         Returns:
-    #         List[Dict[str, str]]: A list of dictionaries representing the retrieved documents.
-    #         """
-    #         # Get the query(s)
-    #         queries = []
-    #         for search_query in response.search_queries:
-    #             queries.append(search_query["text"])
-    #         # Retrieve documents for each query
-    #         retrieved_docs = []
-    #         for query in queries:
-    #             retrieved_docs.extend(retrieve(self.index, query, self.co))
-    #         return retrieved_docs
-    # chatbot = Chatbot(co, index)
     def chat_function(message, history):
-        return "Foo!"
-        # flag = False
-        # reply = ""
-        # for event in chatbot.generate_response(message):
-        #     if event.event_type == "text-generation":
-        #         reply += str(event.text)
-        #         yield reply
-        #     # Citations
-        #     if event.event_type == "citation-generation":
-        #         if not flag:
-        #             reply += "\n\nCITATIONS:\n\n"
-        #             yield reply
-        #             flag = True
-        #         reply += str(event.citations) + "\n"
-        #         yield reply
-    interface = gr.ChatInterface(chat_function)
     print("All ready!")
     return mount_gradio_app(app=web_app, blocks=interface, path="/")

 image = Image.debian_slim("3.11").pip_install(
     "cohere",
+    "gradio==3.50.2",
     "pinecone-client",
 )
     import gradio as gr
     from gradio.routes import mount_gradio_app
+    print("Connecting to cohere client")
+    co = cohere.Client(os.environ["COHERE_API_KEY"])
+    print("Done")
+    pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp")
+    index = pinecone.Index(index_name="td-sec-embeddings")
+    def retrieve(
+        index: pinecone.Index, query: str, co: cohere.Client
+    ) -> List[Dict[str, str]]:
+        """
+        Retrieves documents based on the given query.
+        Embeds the query with Cohere, asks the Pinecone index for the 10 best
+        matches, then reranks the metadata of those matches and keeps the top 3.
+        Parameters:
+        index (pinecone.Index): The index to query.
+        query (str): The query to retrieve documents for.
+        co (cohere.Client): The client used for embedding and reranking.
+        Returns:
+        List[Dict[str, str]]: The metadata dictionaries of the reranked matches, in the order the reranker returned them.
+        NOTE(review): the keys are whatever metadata was stored in the index - presumably 'title', 'text', and 'url' as written by update_vector_db.py, not 'snippet'; verify.
+        """
+        # NOTE(review): redundant - docs_retrieved is reassigned below before it is used.
+        docs_retrieved = []
+        print(f"Calling retrieve for '{query}'")
+        print("Embedding the query")
+        query_emb = co.embed(
+            texts=[query], model="embed-english-v3.0", input_type="search_query"
+        ).embeddings
+        print("Querying pinecone")
+        res = index.query(query_emb, top_k=10, include_metadata=True)
+        print("Preparing to rerank")
+        # Only the metadata of each match is passed on to the reranker.
+        docs_to_rerank = [match["metadata"] for match in res["matches"]]
+        rerank_results = co.rerank(
+            query=query,
+            documents=docs_to_rerank,
+            top_n=3,
+            model="rerank-english-v2.0",
+        )
+        docs_retrieved = []
+        # hit.index is used as a position in docs_to_rerank.
+        for hit in rerank_results:
+            docs_retrieved.append(docs_to_rerank[hit.index])
+        print("Returning retrieved docs")
+        return docs_retrieved
+    class Chatbot:
+        # Pairs a Cohere client with a Pinecone index. A single conversation_id is
+        # generated at construction and passed to every streamed chat call.
+        def __init__(self, co: cohere.Client, index: pinecone.Index):
+            self.index = index
+            self.conversation_id = str(uuid.uuid4())
+            self.co = co
+        def generate_response(self, message: str):
+            """
+            Generates a response to the user's message.
+            First asks Cohere for search queries only. If any come back, documents
+            are retrieved for them and passed to a streamed chat call; otherwise
+            the message is sent to a streamed chat call without documents.
+            Parameters:
+            message (str): The user's message.
+            Yields:
+            Event: Each event of the streamed chat response, in order.
+            Note: this is a generator; it does not return the retrieved documents.
+            """
+            # Generate search queries (if any)
+            # This first call is made without conversation_id.
+            response = self.co.chat(message=message, search_queries_only=True)
+            # If there are search queries, retrieve documents and respond
+            if response.search_queries:
+                print("Retrieving information")
+                documents = self.retrieve_docs(response)
+                response = self.co.chat(
+                    message=message,
+                    documents=documents,
+                    conversation_id=self.conversation_id,
+                    stream=True,
+                )
+                for event in response:
+                    yield event
+            # If there is no search query, directly respond
+            else:
+                response = self.co.chat(
+                    message=message, conversation_id=self.conversation_id, stream=True
+                )
+                for event in response:
+                    yield event
+        def retrieve_docs(self, response) -> List[Dict[str, str]]:
+            """
+            Retrieves documents based on the search queries in the response.
+            Parameters:
+            response: The response object containing search queries.
+            Returns:
+            List[Dict[str, str]]: The documents retrieved for every query, concatenated in query order. Duplicates across queries are not removed.
+            """
+            # Get the query(s)
+            queries = []
+            for search_query in response.search_queries:
+                queries.append(search_query["text"])
+            # Retrieve documents for each query
+            retrieved_docs = []
+            for query in queries:
+                # Calls the retrieve function from the enclosing scope.
+                retrieved_docs.extend(retrieve(self.index, query, self.co))
+            return retrieved_docs
+    chatbot = Chatbot(co, index)
     def chat_function(message, history):
+        """
+        Streams the chatbot's reply for one user message.
+        Parameters:
+        message: The user's message, forwarded to chatbot.generate_response.
+        history: Not used by this function.
+        Yields:
+        str: The reply accumulated so far, re-yielded after each text or citation event.
+        """
+        # flag records whether the "CITATIONS:" header has already been added.
+        flag = False
+        reply = ""
+        for event in chatbot.generate_response(message):
+            if event.event_type == "text-generation":
+                reply += str(event.text)
+                yield reply
+            # Citations
+            if event.event_type == "citation-generation":
+                if not flag:
+                    reply += "\n\nCITATIONS:\n\n"
+                    yield reply
+                    flag = True
+                # Citations are appended as their str() form, one event per line.
+                reply += str(event.citations) + "\n"
+                yield reply
+    interface = gr.ChatInterface(chat_function).queue()
     print("All ready!")
     return mount_gradio_app(app=web_app, blocks=interface, path="/")

simple_script.py CHANGED Viewed

@@ -4,9 +4,6 @@ import pinecone
 import uuid
 from typing import List, Dict
-# from unstructured.chunking.title import chunk_by_title
-# from unstructured.partition.pdf import partition_pdf
 from dotenv import load_dotenv
@@ -14,87 +11,10 @@ load_dotenv()
 co = cohere.Client(os.environ["COHERE_API_KEY"])
-pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="gcp-starter")
 index = pinecone.Index("td-sec-embeddings")
-from typing import List, Dict
-# from unstructured.partition.pdf import partition_pdf
-# from unstructured.chunking.title import chunk_by_title
-import cohere
-sources = [
-    {
-        "title": "2023",
-        "url": "https://www.td.com/content/dam/tdcom/canada/about-td/pdf/quarterly-results/2023/2023-annual-report-e.pdf",
-        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
-    },
-    # {
-    #     "title": "2022",
-    #     "url": "https://www.td.com/document/PDF/ar2022/ar2022-Complete-Report.pdf",
-    #     "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
-    # },
-]
-def load() -> List[Dict[str, str]]:
-    """
-    Loads the documents from the sources and chunks the HTML content.
-    """
-    print("Loading documents...")
-    docs = []
-    for source in sources:
-        elements = partition_pdf(filename=source["filename"])
-        chunks = chunk_by_title(elements)
-        for chunk in chunks:
-            docs.append(
-                {
-                    "title": source["title"],
-                    "text": str(chunk),
-                    "url": source["url"],
-                }
-            )
-    return docs
-def embed(docs: List[Dict[str, str]]) -> List[List[float]]:
-    """
-    Embeds the documents using the Cohere API.
-    """
-    print("Embedding documents...")
-    batch_size = 90
-    docs_len = len(docs)
-    docs_embs = []
-    for i in range(0, docs_len, batch_size):
-        batch = docs[i : min(i + batch_size, docs_len)]
-        texts = [item["text"] for item in batch]
-        docs_embs_batch = co.embed(
-            texts=texts, model="embed-english-v3.0", input_type="search_document"
-        ).embeddings
-        docs_embs.extend(docs_embs_batch)
-    return docs_embs
-def update_index(
-    index: pinecone.Index, docs: List[Dict[str, str]], docs_embs: List[List[float]]
-) -> None:
-    """
-    Indexes the documents for efficient retrieval.
-    """
-    batch_size = 100
-    ids = [str(i) for i in range(len(docs))]
-    to_upsert = list(zip(ids, docs_embs, docs))
-    for i in range(0, len(docs), batch_size):
-        i_end = min(i + batch_size, len(docs))
-        index.upsert(vectors=to_upsert[i:i_end])
 def retrieve(index: pinecone.Index, query: str) -> List[Dict[str, str]]:
     """
@@ -108,21 +28,18 @@ def retrieve(index: pinecone.Index, query: str) -> List[Dict[str, str]]:
     """
     docs_retrieved = []
-    print(f"Calling retrieve for '{query}'")
-    print("Embedding the query")
     query_emb = co.embed(
         texts=[query], model="embed-english-v3.0", input_type="search_query"
     ).embeddings
-    print("Querying pinecone")
     res = index.query(query_emb, top_k=100, include_metadata=True)
-    print("Preparing to rerank")
     docs_to_rerank = [match["metadata"] for match in res["matches"]]
     rerank_results = co.rerank(
         query=query,
         documents=docs_to_rerank,
-        top_n=10,
         model="rerank-english-v2.0",
     )
@@ -130,15 +47,9 @@ def retrieve(index: pinecone.Index, query: str) -> List[Dict[str, str]]:
     for hit in rerank_results:
         docs_retrieved.append(docs_to_rerank[hit.index])
-    print("Returning retrieved docs")
     return docs_retrieved
-# docs = load()
-# docs_embeds = embed(docs)
-# update_index(index, docs=docs, docs_embs=docs_embeds)
 class Chatbot:
     def __init__(self, co: cohere.Client, index: pinecone.Index):
         self.index = index
@@ -168,7 +79,7 @@ class Chatbot:
             print("Retrieving information...")
             documents = self.retrieve_docs(response)
-            print(f"Generating response with documents {documents}")
             response = self.co.chat(
                 message=message,
                 documents=documents,
@@ -198,11 +109,9 @@ class Chatbot:
         """
         # Get the query(s)
-        print("Calling retrieve_docs")
         queries = []
         for search_query in response.search_queries:
             queries.append(search_query["text"])
-        print(queries)
         # Retrieve documents for each query
         retrieved_docs = []

 import uuid
 from typing import List, Dict
 from dotenv import load_dotenv
 co = cohere.Client(os.environ["COHERE_API_KEY"])
+pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp")
 index = pinecone.Index("td-sec-embeddings")
 def retrieve(index: pinecone.Index, query: str) -> List[Dict[str, str]]:
     """
     """
     docs_retrieved = []
     query_emb = co.embed(
         texts=[query], model="embed-english-v3.0", input_type="search_query"
     ).embeddings
     res = index.query(query_emb, top_k=100, include_metadata=True)
     docs_to_rerank = [match["metadata"] for match in res["matches"]]
     rerank_results = co.rerank(
         query=query,
         documents=docs_to_rerank,
+        top_n=3,
         model="rerank-english-v2.0",
     )
     for hit in rerank_results:
         docs_retrieved.append(docs_to_rerank[hit.index])
     return docs_retrieved
 class Chatbot:
     def __init__(self, co: cohere.Client, index: pinecone.Index):
         self.index = index
             print("Retrieving information...")
             documents = self.retrieve_docs(response)
             response = self.co.chat(
                 message=message,
                 documents=documents,
         """
         # Get the query(s)
         queries = []
         for search_query in response.search_queries:
             queries.append(search_query["text"])
         # Retrieve documents for each query
         retrieved_docs = []

update_vector_db.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import cohere
+import os
+import pinecone
+from typing import List, Dict
+from unstructured.chunking.title import chunk_by_title
+from unstructured.partition.pdf import partition_pdf
+from dotenv import load_dotenv
+load_dotenv()
+co = cohere.Client(os.environ["COHERE_API_KEY"])
+pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp")
+index = pinecone.Index("td-sec-embeddings")
+from typing import List, Dict
+# PDF reports to ingest. Each entry gives the title and URL stored with every
+# chunk, plus the local filename that load() reads.
+# The filenames are absolute paths on one machine.
+sources = [
+    {
+        "title": "2023",
+        "url": "https://www.td.com/content/dam/tdcom/canada/about-td/pdf/quarterly-results/2023/2023-annual-report-e.pdf",
+        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
+    },
+    {
+        "title": "2022",
+        "url": "https://www.td.com/document/PDF/ar2022/ar2022-Complete-Report.pdf",
+        # NOTE(review): this is the same file as the 2023 entry above - looks like a
+        # copy-paste leftover; confirm the intended 2022 PDF path.
+        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
+    },
+]
+def load() -> List[Dict[str, str]]:
+    """
+    Loads the PDF documents listed in `sources` and chunks their content by title.
+    Returns:
+    List[Dict[str, str]]: One dictionary per chunk with 'title' and 'url' copied
+    from the source entry and 'text' set to the chunk's string form.
+    """
+    print("Loading documents...")
+    docs = []
+    for source in sources:
+        # Partition the PDF into elements, then group the elements into chunks.
+        elements = partition_pdf(filename=source["filename"])
+        chunks = chunk_by_title(elements)
+        for chunk in chunks:
+            docs.append(
+                {
+                    "title": source["title"],
+                    "text": str(chunk),
+                    "url": source["url"],
+                }
+            )
+    return docs
+def embed(docs: List[Dict[str, str]]) -> List[List[float]]:
+    """
+    Embeds the documents using the Cohere API.
+    Parameters:
+    docs (List[Dict[str, str]]): Documents whose 'text' values are embedded.
+    Returns:
+    List[List[float]]: The embeddings of all batches concatenated in batch order
+    (presumably one embedding per document, in input order - verify against the
+    Cohere embed response).
+    """
+    print("Embedding documents...")
+    # Documents are sent in batches of 90 per request (presumably to stay within
+    # a Cohere per-request limit - TODO confirm).
+    batch_size = 90
+    docs_len = len(docs)
+    docs_embs = []
+    for i in range(0, docs_len, batch_size):
+        batch = docs[i : min(i + batch_size, docs_len)]
+        texts = [item["text"] for item in batch]
+        docs_embs_batch = co.embed(
+            texts=texts, model="embed-english-v3.0", input_type="search_document"
+        ).embeddings
+        docs_embs.extend(docs_embs_batch)
+    return docs_embs
+def update_index(
+    index: pinecone.Index, docs: List[Dict[str, str]], docs_embs: List[List[float]]
+) -> None:
+    """
+    Indexes the documents for efficient retrieval.
+    Parameters:
+    index (pinecone.Index): The index to upsert into.
+    docs (List[Dict[str, str]]): The documents; each is sent as the third element of its vector tuple (presumably stored as metadata - verify against the Pinecone client).
+    docs_embs (List[List[float]]): The embeddings, paired with docs by position.
+    """
+    print("Indexing documents in Pinecone")
+    batch_size = 100
+    # IDs are the positions "0", "1", ... of the documents in this run.
+    ids = [str(i) for i in range(len(docs))]
+    # zip stops at the shortest input, so a length mismatch between docs and
+    # docs_embs is not reported here.
+    to_upsert = list(zip(ids, docs_embs, docs))
+    for i in range(0, len(docs), batch_size):
+        i_end = min(i + batch_size, len(docs))
+        index.upsert(vectors=to_upsert[i:i_end])
+# When run as a script: load and chunk the sources, embed the chunks, then
+# upsert them into the module-level index.
+if __name__ == "__main__":
+    docs = load()
+    docs_embeds = embed(docs)
+    update_index(index, docs=docs, docs_embs=docs_embeds)