himanshukumar378 committed on
Commit
c630cd2
·
verified ·
1 Parent(s): 60db15e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -107
app.py CHANGED
@@ -1,135 +1,120 @@
1
  import gradio as gr
2
- import os
3
  from PyPDF2 import PdfReader
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
5
  from langchain_community.vectorstores import FAISS
6
- from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
7
- from langchain.prompts import PromptTemplate
8
- from transformers import pipeline
9
-
10
# -----------------------------
# 🔹 Model Fallback Setup
# -----------------------------
# Candidate checkpoints, ordered lightest-first so the cheapest usable
# model is tried before the heavyweight ones.
HF_MODELS = [
    "google/flan-t5-small",
    "google/flan-t5-base",
    "google/flan-t5-large",
    "google/flan-ul2",
]
19
 
 
 
20
def load_llm():
    """Walk HF_MODELS in priority order and return the first LLM that loads.

    Returns:
        (HuggingFacePipeline, str): the wrapped pipeline and the model id used.

    Raises:
        RuntimeError: when every candidate model fails to load.
    """
    for candidate in HF_MODELS:
        print(f"🔄 Trying to load model: {candidate}")
        try:
            text2text = pipeline(
                "text2text-generation",
                model=candidate,
                tokenizer=candidate,
                max_new_tokens=512,
            )
        except Exception as e:
            # Log and fall through to the next candidate.
            print(f"⚠️ Failed to load {candidate}: {e}")
        else:
            return HuggingFacePipeline(pipeline=text2text), candidate
    raise RuntimeError("❌ Could not load any Hugging Face model.")
35
-
36
# Resolve the fallback chain once at import time so the app fails fast
# when no model is available.
llm, active_model = load_llm()
print(f"✅ Using model: {active_model}")
39
-
40
# -----------------------------
# 🔹 PDF Processing
# -----------------------------
def process_pdf(pdf_paths):
    """Extract text from PDFs, chunk it, and return FAISS vector DB."""
    pieces = []
    for pdf_path in pdf_paths:
        try:
            for page in PdfReader(pdf_path).pages:
                page_text = page.extract_text()
                if page_text:
                    pieces.append(page_text + "\n")
        except Exception as e:
            # A single unreadable file must not abort the whole batch.
            print(f"⚠️ Error reading {pdf_path}: {e}")

    text = "".join(pieces)
    if not text.strip():
        return None

    # Overlapping chunks so an answer spanning a chunk border isn't lost.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    )
    chunks = splitter.split_text(text)

    # Embed the chunks and index them for similarity search.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(chunks, embedding=embeddings)
69
 
70
# -----------------------------
# 🔹 Question Answering
# -----------------------------
def ask_question(pdf_paths, question):
    """Answer `question` from the uploaded PDFs; always returns a string."""
    # Guard clauses: give clear feedback on missing inputs.
    if not pdf_paths:
        return "⚠️ Please upload at least one PDF."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    vector_db = process_pdf(pdf_paths)
    if vector_db is None:
        return "⚠️ Couldn't extract any text from the PDFs."

    # Pull the top-3 most similar chunks as grounding context.
    retriever = vector_db.as_retriever(search_kwargs={"k": 3})
    hits = retriever.get_relevant_documents(question)
    context = "\n".join(getattr(hit, "page_content", str(hit)) for hit in hits)

    template = PromptTemplate(
        input_variables=["context", "question"],
        template=(
            "Answer the question using ONLY the context below. "
            "If the answer isn't in the context, say you don't know.\n\n"
            "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
        ),
    )
    final_prompt = template.format(context=context, question=question)

    # Try each fallback model in turn until one produces an answer.
    for model_name in HF_MODELS:
        try:
            fallback_pipe = pipeline(
                "text2text-generation",
                model=model_name,
                tokenizer=model_name,
                max_new_tokens=512,
            )
            answer = HuggingFacePipeline(pipeline=fallback_pipe).invoke(final_prompt)
        except Exception as e:
            print(f"⚠️ Model {model_name} failed: {e}")
            continue
        return str(getattr(answer, "content", answer)) + f"\n\n✅ Answered using {model_name}"

    return "❌ All models failed to generate an answer."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
# -----------------------------
# 🔹 Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 📘 PDF Question Answering with Hugging Face (Fallback Models)")

    with gr.Row():
        # FIX: `file_types_metadata` is not a gr.File parameter and raised
        # TypeError at UI construction; removed. type="filepath" hands
        # plain path strings to the callback, file_count="multiple"
        # allows several PDFs at once.
        pdf_input = gr.File(
            label="Upload PDFs",
            file_types=[".pdf"],
            type="filepath",
            file_count="multiple",
        )
        question_input = gr.Textbox(label="Ask a Question")

    answer_output = gr.Textbox(label="Answer", lines=10)

    ask_btn = gr.Button("Get Answer")
    ask_btn.click(fn=ask_question, inputs=[pdf_input, question_input], outputs=answer_output)

# -----------------------------
# 🔹 Launch App
# -----------------------------
if __name__ == "__main__":
    # Bind to all interfaces on the conventional HF Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
 
2
  from PyPDF2 import PdfReader
3
+
4
+ # LangChain components
5
+ from langchain.text_splitter import CharacterTextSplitter
6
  from langchain_community.vectorstores import FAISS
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from langchain_core.prompts import PromptTemplate
9
+ from langchain_community.llms import HuggingFacePipeline
10
+
11
+ # Hugging Face Transformers
12
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
 
 
 
 
 
 
13
 
14
+
15
# ---------------- Load LLM with fallback ----------------
def load_llm():
    """Load the first available FLAN-T5 checkpoint, smallest first.

    Returns:
        HuggingFacePipeline wrapping a text2text-generation pipeline.

    Raises:
        RuntimeError: if none of the candidate checkpoints can be loaded.
    """
    candidates = (
        "google/flan-t5-small",   # lightweight, safe
        "google/flan-t5-base",    # more powerful
        "google/flan-t5-large",   # stronger, but bigger
        "google/flan-t5-xl",      # may fail in free tier, but used if available
    )

    for checkpoint in candidates:
        try:
            tok = AutoTokenizer.from_pretrained(checkpoint)
            seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
            text_pipe = pipeline(
                "text2text-generation",
                model=seq2seq,
                tokenizer=tok,
                max_length=512,
            )
        except Exception as e:
            print(f"⚠️ Failed to load {checkpoint}: {e}")
        else:
            print(f"✅ Loaded model: {checkpoint}")
            return HuggingFacePipeline(pipeline=text_pipe)

    raise RuntimeError("❌ No model could be loaded. Please check Hugging Face space resources.")
41
+
42
+
43
# Build the shared LLM once at import time; startup aborts early if no
# checkpoint can be loaded.
llm = load_llm()
44
+
45
+
46
# ---------------- Process PDF ----------------
def process_pdf(pdf_files):
    """Extract text from the uploaded PDFs and index it in a FAISS store.

    Args:
        pdf_files: iterable of uploads — either path strings or objects
            with a ``.name`` attribute (both Gradio file styles).

    Returns:
        A FAISS vector store over 1000-char overlapping chunks, or None
        when no text could be extracted.
    """
    text = ""
    for pdf in pdf_files:
        # Support both filepath strings and tempfile-like objects; the
        # previous hard-coded `pdf.name` broke on plain path strings.
        path = getattr(pdf, "name", pdf)
        try:
            reader = PdfReader(path)
            for page in reader.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
        except Exception as e:
            # Skip unreadable files instead of failing the whole request.
            print(f"⚠️ Error reading {path}: {e}")

    if not text.strip():
        return None  # return None if empty

    # Split text into overlapping chunks so answers spanning a border survive
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = splitter.split_text(text)

    # Embeddings & vector store
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings)
    return db
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
# ---------------- Ask Questions ----------------
def ask_question(pdf_files, question):
    """Answer `question` from the uploaded PDFs via retrieval + the LLM.

    Always returns a user-facing string (answer or ⚠️ message); it never
    raises, so the Gradio callback always renders something.
    """
    # FIX: validate inputs up front — previously an empty upload or blank
    # question fell into the generic except-branch with a confusing
    # "⚠️ Error while generating answer: ..." message.
    if not pdf_files:
        return "⚠️ Please upload at least one PDF."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    try:
        db = process_pdf(pdf_files)
        if not db:
            return "⚠️ No text found in the uploaded PDF(s)."

        # Retrieve the 3 most similar chunks as grounding context.
        retriever = db.as_retriever(search_kwargs={"k": 3})
        docs = retriever.get_relevant_documents(question)

        # Combine retrieved context
        context = "\n".join(
            doc.page_content if hasattr(doc, "page_content") else str(doc)
            for doc in docs
        )

        # Prompt template
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template="Answer the question using the following context:\n{context}\n\nQuestion: {question}\nAnswer:"
        )

        final_prompt = prompt.format(context=context, question=question)
        response = llm.invoke(final_prompt)

        return response if response else "⚠️ No answer generated. Try another question."

    except Exception as e:
        return f"⚠️ Error while generating answer: {str(e)}"
96
+
97
+
98
# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Multiple PDF Chatbot (with Hugging Face fallback models)")

    with gr.Row():
        # FIX: the original passed `file_types` twice (a SyntaxError — the
        # module could not even be imported) and used `file_types_multiple`,
        # which is not a gr.File parameter. Multi-upload is spelled
        # file_count="multiple"; type="filepath" replaces the deprecated
        # type="file" and yields plain path strings.
        pdf_input = gr.File(
            label="Upload PDF(s)",
            file_types=[".pdf"],
            type="filepath",
            file_count="multiple",
        )

    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question")

    with gr.Row():
        output = gr.Textbox(label="Answer")

    submit = gr.Button("Submit")
    submit.click(fn=ask_question, inputs=[pdf_input, question_input], outputs=output)

demo.launch()