Michtiii committed on
Commit
d277fdd
·
verified ·
1 Parent(s): 9581cf4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -47
app.py CHANGED
@@ -1,68 +1,65 @@
1
- # app.py
2
  import os
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.vectorstores import FAISS
 
7
  from langchain.chains import RetrievalQA
8
- from langchain.llms import HuggingFaceHub
9
  import gradio as gr
10
 
11
# --------- Configuration ---------
DOCS_FOLDER = "Docs" # Folder containing PDF files (relative to the working directory)
HF_API_KEY = os.getenv("HF_TOKEN") # HuggingFace API token; None when the env var is unset
 
 
14
 
15
# --------- Load and process PDFs ---------
def load_pdfs(folder):
    """Extract text from every ``*.pdf`` file in *folder*.

    Returns a flat list with one string per page, in ``os.listdir`` order.
    ``PdfReader.extract_text`` can return ``None`` for pages with no
    extractable text; those are coerced to ``""`` so that downstream
    ``str.join`` calls on the result cannot raise ``TypeError``.
    """
    texts = []
    for file in os.listdir(folder):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder, file)
            reader = PdfReader(pdf_path)
            for page in reader.pages:
                # extract_text() may return None -> map to empty string
                texts.append(page.extract_text() or "")
    return texts
25
 
26
# Build the corpus once at import time (one string per PDF page).
raw_texts = load_pdfs(DOCS_FOLDER)

# Split into smaller chunks for embeddings
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50
)
# NOTE(review): joining all pages with a single space loses page/document
# boundaries — confirm this is intended before relying on chunk provenance.
docs = text_splitter.split_text(" ".join(raw_texts))
34
 
35
# --------- Create embeddings and vectorstore ---------
# Downloads the sentence-transformers weights on first use.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# In-memory FAISS index over the text chunks built above.
vectorstore = FAISS.from_texts(docs, embedding=embeddings)
 
 
38
 
39
# --------- Setup LLM & Retrieval QA chain ---------
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    model_kwargs={"temperature": 0, "max_length": 512},
    huggingfacehub_api_token=HF_API_KEY  # may be None if HF_TOKEN is unset — calls will then fail
)
# "stuff" chain type: retrieved chunks are inserted verbatim into one prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
50
 
51
# --------- Gradio interface ---------
def answer_query(query):
    """Run the retrieval-QA chain on *query* and return its answer string."""
    answer = qa_chain.run(query)
    return answer
 
 
54
 
55
with gr.Blocks() as demo:
    gr.Markdown("## PDF Document RAG QA System")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Enter your question:")
    submit = gr.Button("Ask")

    def chat_fn(user_input, chat_history):
        # Append the (question, answer) pair and clear the textbox
        # (second output resets `msg` to the empty string).
        answer = answer_query(user_input)
        chat_history.append((user_input, answer))
        return chat_history, ""

    # NOTE(review): only the button is wired; pressing Enter in the textbox
    # does nothing — confirm whether msg.submit(chat_fn, ...) was intended.
    submit.click(chat_fn, inputs=[msg, chatbot], outputs=[chatbot, msg])

demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
  import os
2
  from PyPDF2 import PdfReader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
4
  from langchain.vectorstores import FAISS
5
+ from langchain.embeddings import SentenceTransformerEmbeddings
6
  from langchain.chains import RetrievalQA
7
+ from langchain.chat_models import ChatOpenAI # or HuggingFaceChatModel
8
  import gradio as gr
9
 
10
# -----------------------------
# 1. Load PDF files
# -----------------------------
docs_path = "Docs"
all_texts = []

for file in os.listdir(docs_path):
    # Skip anything that is not a PDF.
    if not file.endswith(".pdf"):
        continue
    reader = PdfReader(os.path.join(docs_path, file))
    # One string per document; pages without extractable text yield "".
    page_texts = [page.extract_text() or "" for page in reader.pages]
    all_texts.append("".join(page_texts))

# Single corpus string, documents separated by newlines.
full_text = "\n".join(all_texts)
25
 
26
# -----------------------------
# 2. Split text into chunks
# -----------------------------
# ~1000-character chunks with 200 characters of overlap between neighbours.
_SPLITTER_KWARGS = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**_SPLITTER_KWARGS)
texts = text_splitter.split_text(full_text)
34
 
35
# -----------------------------
# 3. Create embeddings and vector store
# -----------------------------
# Downloads the sentence-transformers weights on first run.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Builds an in-memory FAISS index from the chunk list above.
vectorstore = FAISS.from_texts(texts, embedding_model)
40
 
41
# -----------------------------
# 4. Create retrieval QA chain
# -----------------------------
# NOTE(review): ChatOpenAI reads OPENAI_API_KEY from the environment —
# confirm it is configured for this deployment.
llm = ChatOpenAI(temperature=0)  # deterministic decoding; or use HuggingFace model if you prefer
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff"  # retrieved chunks are stuffed verbatim into a single prompt
)
50
 
51
# -----------------------------
# 5. Gradio interface
# -----------------------------
def answer_question(query):
    """Answer *query* using the retrieval-QA chain built above."""
    response = qa.run(query)
    return response
56
 
57
with gr.Blocks() as demo:
    gr.Markdown("# PDF RAG + Summarization Chatbot")
    with gr.Row():
        query_input = gr.Textbox(label="Ask a question about your PDFs")
        output_box = gr.Textbox(label="Answer")
    # Answer on Enter in the textbox as well as on a button click.
    query_input.submit(answer_question, inputs=query_input, outputs=output_box)
    ask_button = gr.Button("Submit")
    ask_button.click(answer_question, inputs=query_input, outputs=output_box)

demo.launch(server_name="0.0.0.0", server_port=7860)