Spaces:

jpangas
/

gradio-extractor

Build error

App Files Files Community

jpangas committed on Jan 21, 2025

Commit

15f76b2

verified ·

1 Parent(s): de8ed04

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -87

app.py CHANGED Viewed

@@ -13,103 +13,105 @@ from langgraph.graph import START, StateGraph
 from typing_extensions import List, TypedDict
 import xmltodict
-qa_graph = None
-current_file = None
-class State(TypedDict):
-    question: str
-    context: List[Document]
-    answer: str
-def get_extra_docs(file_name):
-    # TODO: Add the code to extract the title, authors and abstract from the PDF file
-    client = GrobidClient(config_path="./config.json")
-    information = client.process_pdf(
-        "processHeaderDocument",
-        file_name,
-        generateIDs=False,
-        consolidate_header=False,
-        consolidate_citations=False,
-        include_raw_citations=False,
-        include_raw_affiliations=False,
-        tei_coordinates=False,
-        segment_sentences=False,
-    )
-    dict_information = xmltodict.parse(information[2])
-    title = dict_information["tei"]["teiHeader"]["fileDesc"]["titleStmt"]["title"]
-    abstract = dict_information["tei"]["teiHeader"]["profileDesc"]["abstract"]["p"]
-    return title
-def initiate_graph(file):
-    global qa_graph, current_file
-    if current_file != file.name:
-        qa_graph = None
-        current_file = file.name
-    loader = GenericLoader.from_filesystem(
-        file.name,
-        parser=GrobidParser(
             segment_sentences=False,
-            grobid_server="https://jpangas-grobid-paper-extractor.hf.space/api/processFulltextDocument",
-        ),
-    )
-    docs = loader.load()
-    embeddings = OpenAIEmbeddings()
-    vector_store = InMemoryVectorStore(embeddings)
-    llm = ChatOpenAI(model="gpt-4o-mini")
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000, chunk_overlap=200, add_start_index=True
-    )
-    all_splits = text_splitter.split_documents(docs)
-    vector_store.add_documents(documents=all_splits)
-    prompt = hub.pull("rlm/rag-prompt")
-    def retrieve(state: State):
-        retrieved_docs = vector_store.similarity_search(state["question"])
-        return {"context": retrieved_docs}
-    def generate(state: State):
-        docs_content = "\n\n".join(doc.page_content for doc in state["context"])
-        messages = prompt.invoke(
-            {"question": state["question"], "context": docs_content}
         )
-        response = llm.invoke(messages)
-        return {"answer": response.content}
-    graph_builder = StateGraph(State).add_sequence([retrieve, generate])
-    graph_builder.add_edge(START, "retrieve")
-    qa_graph = graph_builder.compile()
-    name = file.name.split("/")[-1]
-    return f"The paper {name} has been loaded and is ready for questions!"
-def answer_question(question, history):
-    global qa_graph, current_file
-    if qa_graph is None:
-        return "Please upload a PDF file first and wait for it to be loaded!"
-    response = qa_graph.invoke({"question": question})
-    return response["answer"]
-def slow_echo(message, history):
-    answer = answer_question(message, history)
-    if answer == "Please upload a PDF file first!":
-        yield answer
-        return
-    for i in range(len(answer)):
-        time.sleep(0.01)
-        yield answer[: i + 1]
 def main():
     with gr.Blocks() as demo:
         file_input = gr.File(
-            label="Upload a research paper as a pdf file and wait for it to be loaded",
             file_types=[".pdf"],
         )
@@ -117,11 +119,12 @@ def main():
             label="Status of Upload", value="No Paper Uploaded", interactive=False
         )
-        chat_interface = gr.ChatInterface(slow_echo, type="messages")
-        file_input.upload(fn=initiate_graph, inputs=file_input, outputs=textbox)
-    demo.queue().launch()
 if __name__ == "__main__":
     main()

 from typing_extensions import List, TypedDict
 import xmltodict
class PaperQA:
    """Question-answering session over one uploaded research paper.

    An instance holds the compiled retrieve/generate graph for the most
    recently uploaded PDF, so the state lives on the object instead of in
    module-level globals.
    """

    # Reply used while no paper has been indexed yet. Defined once so that
    # ``answer_question`` and ``slow_echo`` cannot drift apart.
    NOT_READY_MESSAGE = "Please upload a PDF file first and wait for it to be loaded!"

    # Pause between two successive partial replies yielded by ``slow_echo``.
    STREAM_DELAY_SECONDS = 0.01

    def __init__(self):
        # Compiled graph; stays None until ``initiate_graph`` has finished.
        self.qa_graph = None
        # ``file.name`` of the paper the current graph was built for.
        self.current_file = None

    class State(TypedDict):
        # Keys exchanged between the "retrieve" and "generate" graph steps.
        question: str
        context: List[Document]
        answer: str

    def get_extra_docs(self, file_name):
        """Return the paper title read from the PDF header.

        The file is sent to the ``processHeaderDocument`` service through a
        ``GrobidClient`` configured from ``./config.json``; the title is taken
        from the parsed TEI header.

        Args:
            file_name: Path of the PDF to process.

        Returns:
            The ``title`` entry of the TEI ``titleStmt``.
        """
        # TODO: also extract the authors and the abstract from the PDF file
        client = GrobidClient(config_path="./config.json")
        information = client.process_pdf(
            "processHeaderDocument",
            file_name,
            generateIDs=False,
            consolidate_header=False,
            consolidate_citations=False,
            include_raw_citations=False,
            include_raw_affiliations=False,
            tei_coordinates=False,
            segment_sentences=False,
        )
        # NOTE(review): index 2 is presumably the XML text of the tuple the
        # client returns — confirm against the GROBID client.
        dict_information = xmltodict.parse(information[2])
        # Only the title is returned, so nothing else is looked up here: an
        # unused abstract lookup would fail on headers that have no abstract.
        return dict_information["tei"]["teiHeader"]["fileDesc"]["titleStmt"]["title"]

    def initiate_graph(self, file):
        """Index the uploaded paper and compile the question-answering graph.

        Args:
            file: Uploaded file object; only its ``name`` attribute (the path
                of the PDF) is read.

        Returns:
            A status message naming the paper that was loaded.
        """
        if self.current_file != file.name:
            # A different paper is coming in: drop the old graph so questions
            # are refused until the new one has been built.
            self.qa_graph = None
            self.current_file = file.name

        loader = GenericLoader.from_filesystem(
            file.name,
            parser=GrobidParser(
                segment_sentences=False,
                grobid_server="https://jpangas-grobid-paper-extractor.hf.space/api/processFulltextDocument",
            ),
        )
        docs = loader.load()

        embeddings = OpenAIEmbeddings()
        vector_store = InMemoryVectorStore(embeddings)
        llm = ChatOpenAI(model="gpt-4o-mini")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, add_start_index=True
        )
        all_splits = text_splitter.split_documents(docs)
        vector_store.add_documents(documents=all_splits)

        prompt = hub.pull("rlm/rag-prompt")

        def retrieve(state: self.State):
            # Fetch the chunks most similar to the question.
            retrieved_docs = vector_store.similarity_search(state["question"])
            return {"context": retrieved_docs}

        def generate(state: self.State):
            # Join the retrieved chunks into one context string for the prompt.
            docs_content = "\n\n".join(doc.page_content for doc in state["context"])
            messages = prompt.invoke(
                {"question": state["question"], "context": docs_content}
            )
            response = llm.invoke(messages)
            return {"answer": response.content}

        graph_builder = StateGraph(self.State).add_sequence([retrieve, generate])
        graph_builder.add_edge(START, "retrieve")
        # Assigned only after everything above succeeded, so a failed load
        # leaves ``qa_graph`` as None.
        self.qa_graph = graph_builder.compile()

        name = file.name.split("/")[-1]
        return f"The paper {name} has been loaded and is ready for questions!"

    def answer_question(self, question, history):
        """Answer ``question`` using the graph built for the loaded paper.

        Args:
            question: The user's question.
            history: Chat history; accepted but not used.

        Returns:
            The graph's ``answer`` value, or ``NOT_READY_MESSAGE`` when no
            graph has been built yet.
        """
        if self.qa_graph is None:
            return self.NOT_READY_MESSAGE
        response = self.qa_graph.invoke({"question": question})
        return response["answer"]

    def slow_echo(self, message, history):
        """Yield the answer to ``message`` as progressively longer prefixes.

        When no paper is loaded the not-ready notice is yielded once, in full,
        without the per-character delay.

        Args:
            message: The user's question.
            history: Chat history, forwarded to ``answer_question``.

        Yields:
            ``answer[:1]``, ``answer[:2]``, ... up to the complete answer.
        """
        # Check the state itself rather than comparing answer strings: the
        # previous string comparison used a different literal than the one
        # ``answer_question`` returns, so this branch could never be taken.
        if self.qa_graph is None:
            yield self.NOT_READY_MESSAGE
            return

        answer = self.answer_question(message, history)
        for end in range(1, len(answer) + 1):
            time.sleep(self.STREAM_DELAY_SECONDS)
            yield answer[:end]
 def main():
+    qa_app = PaperQA()
     with gr.Blocks() as demo:
         file_input = gr.File(
+            label="Upload a research paper as a PDF file and wait for it to be loaded",
             file_types=[".pdf"],
         )
             label="Status of Upload", value="No Paper Uploaded", interactive=False
         )
+        chat_interface = gr.ChatInterface(qa_app.slow_echo, type="messages")
+        file_input.upload(fn=qa_app.initiate_graph, inputs=file_input, outputs=textbox)
+    demo.launch()
 if __name__ == "__main__":
     main()