LayMui committed on
Commit
ccd3265
·
verified ·
1 Parent(s): 31d5df4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add this import
2
+ import tempfile
3
+ from dotenv import load_dotenv
4
+ import os
5
+ from docling.document_converter import DocumentConverter
6
+ from langchain_community.document_loaders import TextLoader
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.embeddings import HuggingFaceEmbeddings
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_community.chat_models import ChatAnthropic
12
+ from langchain.chains import RetrievalQA
13
+ import gradio as gr
14
+ import tempfile
15
+
16
+
17
# Module-level state shared by the callbacks defined in this file.
# Everything starts unset; setup_chain() assigns the LLM, process_pdf()
# assigns the vector store and retriever, and either of them builds the QA
# chain once both an LLM and a retriever are available.
retriever = None    # retriever view over the current vector store
vectorstore = None  # FAISS index built from the most recently processed PDF
qa_chain = None     # RetrievalQA chain combining llm + retriever
llm = None          # chat model chosen through setup_chain()
21
+
22
def process_pdf(file_path):
    """Index a PDF so that questions can be answered against its content.

    The PDF is converted to Markdown with Docling, split into overlapping
    chunks, embedded, and stored in an in-memory FAISS index. The module
    globals ``vectorstore`` and ``retriever`` are replaced, and ``qa_chain``
    is rebuilt when an LLM has already been configured.

    Args:
        file_path: Path of the PDF to index. A non-path object exposing the
            path as ``.name`` is also accepted. ``None`` or an empty value
            (no file selected) leaves the current state untouched.

    Returns:
        A short status message suitable for display in the UI.
    """
    global vectorstore, retriever, qa_chain

    # The uploader's change event also fires when the file is cleared.
    if not file_path:
        return "No PDF uploaded."

    # Only fall back to ``.name`` for non-path objects: on a pathlib.Path,
    # ``.name`` would be the bare file name rather than the full path.
    if isinstance(file_path, (str, os.PathLike)):
        pdf_path = file_path
    else:
        pdf_path = getattr(file_path, "name", file_path)

    # Parse PDF with Docling
    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    markdown_content = result.document.export_to_markdown()

    # Save markdown temporarily; delete=False keeps the file on disk after the
    # handle is closed so the loader can reopen it by name.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w", encoding="utf-8"
    ) as tmp_md:
        tmp_md.write(markdown_content)
        tmp_md_path = tmp_md.name

    try:
        # Read back with the same encoding the file was written in instead of
        # relying on the platform default.
        documents = TextLoader(tmp_md_path, encoding="utf-8").load()
    finally:
        # The temp file is only a hand-off to the loader; always remove it.
        os.remove(tmp_md_path)

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    vectorstore = FAISS.from_documents(docs, embeddings)
    retriever = vectorstore.as_retriever()

    # Without an LLM the chain cannot be built yet; setup_chain() builds it
    # later from the retriever stored above.
    if llm is None:
        return "PDF processed. Set your API key and provider to start asking questions."

    # Rebuild QA chain with current LLM
    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    return "PDF processed successfully. You can now ask questions."
49
+
50
def setup_chain(api_key, provider):
    """Configure the chat model used to answer questions.

    Validates the inputs, exports the API key through the provider's
    environment variable, instantiates the matching chat model into the
    module global ``llm`` and, when a retriever already exists (a PDF was
    processed earlier), rebuilds the module global ``qa_chain``.

    Args:
        api_key: API key for the selected provider. Surrounding whitespace is
            ignored; an empty or missing key is rejected.
        provider: ``"openai"`` or ``"anthropic"`` (case-insensitive).

    Returns:
        A status message describing the outcome, for display in the UI.
    """
    global llm, qa_chain

    # A cleared selection may arrive as None; treat it like any other
    # unsupported value instead of failing on ``.lower()``.
    provider_name = (provider or "").lower()
    if provider_name not in ("openai", "anthropic"):
        return "Unsupported provider. Please select 'openai' or 'anthropic'."

    # Reject blank keys up front rather than exporting an unusable value.
    api_key = (api_key or "").strip()
    if not api_key:
        return "Please enter an API key."

    # NOTE(review): model identifiers are hard-coded; confirm they are still
    # offered by the respective providers.
    if provider_name == "anthropic":
        os.environ["ANTHROPIC_API_KEY"] = api_key
        llm = ChatAnthropic(model_name="claude-3-sonnet-20240229")
    else:
        os.environ["OPENAI_API_KEY"] = api_key
        llm = ChatOpenAI(model_name="gpt-4o")

    # If a retriever is already set (PDF uploaded), rebuild qa_chain so it
    # uses the newly selected LLM. ``retriever`` is only read here, so no
    # ``global`` declaration is needed for it.
    if retriever is not None:
        qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    return "API key and provider set successfully. You can now upload a PDF and ask questions."
68
+
69
def answer_question(user_input):
    """Answer a question about the processed PDF using the QA chain.

    Args:
        user_input: The question text. ``None`` and whitespace-only values
            are treated as an empty question.

    Returns:
        The chain's answer, or a guidance message when the chain has not been
        built yet or no question was entered.
    """
    if qa_chain is None:
        return "Please upload a PDF and set your API key and provider first."

    question = (user_input or "").strip()
    if not question:
        return "Please enter a question."

    # ``Chain.run`` is deprecated; ``invoke`` returns a dict and RetrievalQA
    # stores its answer under the "result" key.
    return qa_chain.invoke({"query": question})["result"]
76
+
77
+ import gradio as gr
78
+
79
# Gradio front end: credentials row, status box, PDF upload, and Q&A widgets.
# NOTE(review): nesting is reconstructed from syntax because the original
# indentation was lost; the row is assumed to hold the three credential widgets.
with gr.Blocks() as iface:
    gr.Markdown(
        "# PDF Chat App with Docling and LangChain\nAsk questions directly from your PDF document."
    )

    # Provider credentials and the button that applies them.
    with gr.Row():
        api_key_input = gr.Textbox(
            label="API Key",
            type="password",
            placeholder="Enter your API Key",
        )
        provider_input = gr.Dropdown(
            choices=["openai", "anthropic"],
            label="Provider",
            value="openai",
        )
        set_api_button = gr.Button("Set API Key and Provider")

    # Read-only box that receives the output of both setup_chain and process_pdf.
    api_status = gr.Textbox(label="Status", interactive=False)

    pdf_uploader = gr.File(label="Upload PDF", file_types=[".pdf"])

    # Question entry, trigger button, and read-only answer display.
    question_input = gr.Textbox(
        lines=2,
        placeholder="Ask a question about the PDF...",
    )
    ask_button = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", interactive=False)

    # Event wiring: each handler's return value is written to its `outputs`.
    set_api_button.click(
        fn=setup_chain,
        inputs=[api_key_input, provider_input],
        outputs=api_status,
    )
    pdf_uploader.change(
        fn=process_pdf,
        inputs=pdf_uploader,
        outputs=api_status,
    )
    ask_button.click(
        fn=answer_question,
        inputs=question_input,
        outputs=answer_output,
    )

# Start the app; share=True requests a public Gradio share link.
iface.launch(share=True)