# CharBotWithPDF / app.py
# (HF Space file header — author: manabb, commit 8fd40c9 verified)
# app.py
import os
import gradio as gr
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader
import pypdf
from langchain.prompts import PromptTemplate
from huggingface_hub import upload_folder
from huggingface_hub import HfApi, upload_file
from huggingface_hub import hf_hub_download
from huggingface_hub import (
file_exists,
upload_file,
repo_exists,
create_repo,
hf_hub_download
)
import shutil
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline
# Optional: Set HF Token if needed-allWrite
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv("HF_TOKEN")
api = HfApi(token=os.getenv("HF_TOKEN"))  # token comes from the Space secret HF_TOKEN

# Initialize embedding model (module-level: downloads weights on first run)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Store the variables globally (shared across Gradio UI events)
qa_chain = None   # RetrievalQA chain for TinyLlama, built lazily by bePrepare()
qa_chain1 = None  # RetrievalQA chain for flan-t5, built lazily by bePrepare1()
llm=None
llm1=None
repo_id=os.getenv("reposit_id")  # HF dataset repo that stores the FAISS index + PDFs

#=============================================google/flan-t5-small
# Load HF model (lightweight for CPU)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Wrap in pipeline
#pipe1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
pipe1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
if llm1 is None:  # always true at import time; kept as a guard against re-execution
    llm1 = HuggingFacePipeline(pipeline=pipe1)
#=============================================TinyLlama/TinyLlama-1.1B-Chat-v1.0
# Create optimized pipeline for TinyLlama
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    tokenizer=AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
    device_map="auto" if torch.cuda.is_available() else None,   # spread layers over GPUs if present
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.15,
    # NOTE(review): `tokenizer` here is the flan-t5 tokenizer defined above,
    # not TinyLlama's — its eos id is used as pad id, with 128001 as the
    # fallback. Confirm this is the intended pad token for TinyLlama.
    pad_token_id=tokenizer.eos_token_id if 'tokenizer' in locals() else 128001,
    trust_remote_code=True
)
# Build LangChain LLM wrapper
if llm is None:  # always true at import time; kept as a guard against re-execution
    llm = HuggingFacePipeline(pipeline=pipe)
def format_as_bullets(text):
    """Prefix each non-empty line of *text* with a bullet marker.

    Blank lines are dropped. If no non-empty line remains, the original
    text is returned unchanged.
    """
    segments = (part.strip() for part in text.strip().split('\n'))
    bulleted = '\n'.join(f"β€’ {part}" for part in segments if part)
    return bulleted if bulleted else text
#=============================================
def create_faiss_index(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Create FAISS index from PDF and upload to HF dataset repo.

    Tries PyPDFLoader first and falls back to PyMuPDFLoader. The PDF, the
    FAISS index and its pickle sidecar are uploaded to *repo_id* (dataset).

    Args:
        repo_id: HF dataset repo id to upload into.
        file: local path of the PDF to index.
        embedding_model: sentence-transformers model; must match the model
            later used to load/query this index.

    Returns:
        A human-readable status message (βœ… / ❌).
    """
    def _index_and_upload(loader_cls):
        # Build the index with the given loader class and push all files to
        # the repo. Raises on any loader/embedding/upload failure so the
        # caller can fall back to the next loader.
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        documents = loader_cls(file).load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = text_splitter.split_documents(documents)
        db = FAISS.from_documents(new_docs, embeddings)
        db.save_local("temp_faiss")
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.upload_file(path_or_fileobj=file, path_in_repo=f"docs/{os.path.basename(file)}", repo_id=repo_id, repo_type="dataset")
        api.upload_file(path_or_fileobj="temp_faiss/index.faiss", path_in_repo="index.faiss", repo_id=repo_id, repo_type="dataset")
        api.upload_file(path_or_fileobj="temp_faiss/index.pkl", path_in_repo="index.pkl", repo_id=repo_id, repo_type="dataset")

    message = "Index creation started"
    try:
        # Start from a clean scratch directory.
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
        try:
            _index_and_upload(PyPDFLoader)
            message = "βœ… Index created successfully with PyPDFLoader and uploaded to repo"
        except Exception as e1:
            print(f"PyPDFLoader failed: {e1}")
            try:
                # Fallback loader for PDFs PyPDF cannot parse.
                _index_and_upload(PyMuPDFLoader)
                message = f"βœ… PyPDFLoader failed ({e1}), PyMuPDFLoader succeeded and uploaded to repo"
            except Exception as e2:
                message = f"❌ Both loaders failed. PyPDF: {e1}, PyMuPDF: {e2}"
    finally:
        # BUGFIX: the original attached this cleanup to the inner fallback
        # try only, so temp_faiss was left behind whenever PyPDFLoader
        # succeeded. Clean up on every path.
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
    return message
# Usage
#result = create_faiss_index("your_username/your-dataset", "path/to/your/file.pdf")
#print(result)
#=============
def update_faiss_from_hf(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Load existing FAISS from HF, add new docs, push updated version.

    Args:
        repo_id: HF dataset repo that already contains index.faiss/index.pkl.
        file: local path of the new PDF to merge into the index.
        embedding_model: sentence-transformers model name; must match the
            model used when the index was first built, or vector dimensions
            will not line up.

    Returns:
        A multi-line status message describing each step's outcome.
    """
    message = ""
    try:
        # Step 1: Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        # Step 2: Download existing FAISS files (hf_hub_download caches them
        # locally and returns the cached paths)
        print("Downloading existing FAISS index...")
        faiss_path = hf_hub_download(repo_id=repo_id, filename="index.faiss", repo_type="dataset")
        pkl_path = hf_hub_download(repo_id=repo_id, filename="index.pkl", repo_type="dataset")
        # Step 3: Load existing vectorstore. Both files land in the same
        # snapshot directory, so dirname(faiss_path) locates the pair.
        folder_path = os.path.dirname(faiss_path)
        vectorstore = FAISS.load_local(
            folder_path=folder_path,
            embeddings=embeddings,
            allow_dangerous_deserialization=True  # index.pkl is a pickle; we trust our own repo
        )
        message += f"βœ… Loaded existing index with {vectorstore.index.ntotal} vectors\n"
        # Step 4: Load new document, trying each loader in order as fallback
        documents = None
        loaders = [
            ("PyPDFLoader", PyPDFLoader),
            ("PyMuPDFLoader", PyMuPDFLoader)
        ]
        for loader_name, LoaderClass in loaders:
            try:
                print(f"Trying {loader_name}...")
                loader = LoaderClass(file)
                documents = loader.load()
                message += f"βœ… Loaded {len(documents)} pages with {loader_name}\n"
                break
            except Exception as e:
                message += f"❌ {loader_name} failed: {str(e)[:100]}...\n"
                continue
        if documents is None:
            return "❌ All PDF loaders failed"
        # Step 5: Split documents into overlapping chunks for retrieval
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = text_splitter.split_documents(documents)
        message += f"βœ… Created {len(new_docs)} chunks from new document\n"
        # Step 6: Add new documents to existing index
        vectorstore.add_documents(new_docs)
        message += f"βœ… Added to index. New total: {vectorstore.index.ntotal} vectors\n"
        # Step 7: Save updated index to a scratch folder
        temp_dir = "temp_faiss_update"
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        vectorstore.save_local(temp_dir)
        # Step 8: Upload the source PDF and both index files back to the repo
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.upload_file(
            path_or_fileobj=file,
            path_in_repo=f"docs/{os.path.basename(file)}",
            repo_id=repo_id,
            repo_type="dataset"
        )
        api.upload_file(
            path_or_fileobj=f"{temp_dir}/index.faiss",
            path_in_repo="index.faiss",
            repo_id=repo_id,
            repo_type="dataset"
        )
        api.upload_file(
            path_or_fileobj=f"{temp_dir}/index.pkl",
            path_in_repo="index.pkl",
            repo_id=repo_id,
            repo_type="dataset"
        )
        message += f"βœ… Successfully updated repo with {len(new_docs)} new chunks!"
    except Exception as e:
        message += f"❌ Update failed: {str(e)}"
    finally:
        # Cleanup scratch folder whether or not the update succeeded
        if os.path.exists("temp_faiss_update"):
            shutil.rmtree("temp_faiss_update")
    return message
# Usage
# result = update_faiss_from_hf("yourusername/my-faiss-store", "new_document.pdf")
# print(result)
#====================
def upload_and_prepare(file, user):
    """Password-gated upload handler: index *file* and refresh the PDF list.

    Returns a (status message, markdown PDF list) pair for the two UI outputs.
    """
    mm = ""
    pdf_links = "**No PDFs**"
    if user != os.getenv("uploading_password"):
        return "❌ Unauthorized User", pdf_links
    try:
        # Update the existing index when one is already in the repo,
        # otherwise build a fresh one.
        has_index = file_exists(repo_id=repo_id, filename="index.faiss", repo_type="dataset")
        mm = update_faiss_from_hf(repo_id, file) if has_index else create_faiss_index(repo_id, file)
        # Regenerate the markdown list of PDFs now stored in the repo.
        api = HfApi(token=os.getenv("HF_TOKEN"))
        repo_files = api.list_repo_files(repo_id, repo_type="dataset")
        pdf_links = "\n".join(
            f"β€’ [πŸ“„ {name}](https://huggingface.co/datasets/{repo_id}/resolve/main/{name})"
            for name in repo_files if name.endswith('.pdf')
        )
    except Exception as e:
        mm += f"\n❌ Error: {e}"
    return mm, pdf_links
#============
def upload_and_prepare_old(file,user):
    """Legacy upload handler (superseded by upload_and_prepare).

    Validates the upload password, updates or creates the FAISS index in the
    dataset repo, and returns (status message, markdown PDF list).
    """
    mm=""
    if user == os.getenv("uploading_password"):
        # BUGFIX: the original issued two separate file_exists() network
        # calls ("if exists" then "if not exists"); a single if/else gives
        # the same behavior with one Hub round-trip.
        if file_exists(repo_id=repo_id, filename="index.faiss", repo_type="dataset"):
            mm=update_faiss_from_hf(repo_id, file)
        else:
            mm=create_faiss_index(repo_id, file)
    else:
        mm="❌ Unauthorized User"
    # After processing, list the PDFs currently stored in the repo.
    api = HfApi(token=os.getenv("HF_TOKEN"))
    pdf_files = api.list_repo_files(repo_id, repo_type="dataset")
    pdf_links = "\n".join([f"β€’ [πŸ“„ {f}](https://huggingface.co/datasets/{repo_id}/resolve/main/{f})"
                           for f in pdf_files if f.endswith('.pdf')])
    return mm, pdf_links  # Update both outputs
#======================================================================
def generate_qa_chain(repo_id, embedding_model="sentence-transformers/all-MiniLM-L6-v2", llm=None):
    """
    Generate QA chain from HF dataset repo FAISS index.

    Args:
        repo_id: HF dataset repo holding index.faiss / index.pkl.
        embedding_model: sentence-transformers model name; must match the
            model used to build the stored index.
        llm: LangChain LLM wrapper to answer with (required in practice).

    Returns:
        A RetrievalQA chain, or None if any step fails.
    """
    try:
        # Step 1: Create embeddings (FIX: was missing)
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        # Step 2: Download FAISS files from HF Hub (cached locally)
        faiss_path = hf_hub_download(
            repo_id=repo_id,
            filename="index.faiss",
            repo_type="dataset"
        )
        # Downloaded so the pickle sidecar sits next to index.faiss in the
        # same snapshot directory for load_local below.
        pkl_path = hf_hub_download(
            repo_id=repo_id,
            filename="index.pkl",
            repo_type="dataset"
        )
        # Step 3: Load FAISS vectorstore (FIX: pass embeddings object, not string)
        folder_path = os.path.dirname(faiss_path)
        vectorstore = FAISS.load_local(
            folder_path=folder_path,
            embeddings=embeddings,  # FIXED: was 'embedding_model' string
            allow_dangerous_deserialization=True  # index.pkl is a pickle; trusted repo only
        )
        # Step 4: Create retriever returning the 3 most similar chunks
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        # Step 5: Custom prompt template
        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""
Answer strictly based on the context below.
Mention rule number / circular reference and **PAGE NUMBER**..
Add interpretation.
If answer is not found, say "Not available in the provided context".
Question: {question}
Context: {context}
Answer (include page references):
"""
        )
        # Step 6: Setup RetrievalQA chain ("stuff" = all chunks in one prompt)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,  # Make sure llm is passed or defined globally
            chain_type="stuff",
            chain_type_kwargs={"prompt": prompt_template},
            retriever=retriever,
            return_source_documents=True
        )
    except Exception as e:
        print(f"Error in generate_qa_chain: {e}")
        return None
    return qa_chain
# Usage example:
# llm = HuggingFacePipeline(...) # Your LLM setup
# qa = generate_qa_chain("your_username/your-dataset", llm=llm)
# result = qa.invoke({"query": "What is the main rule?"})
# print(result["result"])
#============================
def bePrepare():
    """UI handler: (re)build the TinyLlama QA chain from the repo index."""
    global qa_chain
    qa_chain = generate_qa_chain(repo_id, llm=llm)
    return "I am ready, ask me questions with model tiny Lama."
def bePrepare1():
    """UI handler: (re)build the flan-t5 QA chain from the repo index."""
    global qa_chain1
    qa_chain1 = generate_qa_chain(repo_id, llm=llm1)
    return "I am ready, ask me questions with model google flan-t5."
def ask_question(query):
    """Answer *query* with the TinyLlama-backed RetrievalQA chain.

    Returns the raw answer, a bullet-formatted copy, and markdown links to
    the top-3 source documents. Requires bePrepare() to have run first so
    the global qa_chain exists.
    """
    if not query or not qa_chain:
        return "❌ Please click prepare button first and check whether question is empty"
    response = qa_chain.invoke({"query": query})
    result = response["result"]
    bullet_result = format_as_bullets(result)
    sources = response.get("source_documents", [])
    source_info = ""
    for i, doc in enumerate(sources[:3]):
        page_num = doc.metadata.get('page', 'Unknown')
        filename = os.path.basename(doc.metadata.get('source', 'Unknown'))
        # BUGFIX: `filename` was computed but never used — the URL and link
        # text contained the literal placeholder "(unknown)". Link to the
        # actual uploaded document instead.
        repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
        source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
    return f"{result}\n\n In bullet form \n{bullet_result}\n\n**πŸ“„ Sources:**{source_info}"
def ask_question1(query):
    """Answer *query* with the flan-t5-backed RetrievalQA chain.

    Returns the raw answer, a bullet-formatted copy, and markdown links to
    the top-3 source documents. Requires bePrepare1() to have run first so
    the global qa_chain1 exists.
    """
    if not query or not qa_chain1:
        return "❌ Please click prepare button first and check whether question is empty"
    response = qa_chain1.invoke({"query": query})
    result = response["result"]
    bullet_result = format_as_bullets(result)
    sources = response.get("source_documents", [])
    source_info = ""
    for i, doc in enumerate(sources[:3]):
        page_num = doc.metadata.get('page', 'Unknown')
        filename = os.path.basename(doc.metadata.get('source', 'Unknown'))
        # BUGFIX: `filename` was computed but never used — the URL and link
        # text contained the literal placeholder "(unknown)". Link to the
        # actual uploaded document instead.
        repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
        source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
    return f"{result}\n\n In bullet form \n{bullet_result}\n\n**πŸ“„ Sources:**{source_info}"
#===============================================
#delete entire repo
def delete_entire_repo(user):
    """Delete and recreate the entire dataset repo (password-protected).

    Returns a status string for the UI in every case.
    """
    repo=os.getenv("reposit_id")
    if user != os.getenv("uploading_password"):
        return "❌ Unauthorized user"
    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.delete_repo(repo_id=repo, repo_type="dataset")
        api.create_repo(repo_id=repo, repo_type="dataset", private=False)
        return f"βœ… Repo {repo_id} reset successfully"
    except Exception as e:
        # BUGFIX: the original assigned the error message to a local and
        # then fell off the end, returning None so the UI showed nothing.
        return f"❌ error during deletetion & creation of repo: {e} "
#===============================================
# ❌ Static (never updates)
# pdf_list = gr.Markdown("**No documents loaded yet.**")
# βœ… Dynamic function
def get_pdf_list():
    """Return a markdown bullet list of the PDFs stored in the dataset repo."""
    repo_id=os.getenv("reposit_id")
    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))
        repo_files = api.list_repo_files(repo_id, repo_type="dataset")
        pdf_files = [name for name in repo_files if name.endswith('.pdf')]
        if not pdf_files:
            return "**No PDF documents in repo yet.**"
        # One markdown link per PDF, pointing at the raw file on the Hub.
        links = [
            f"β€’ [πŸ“„ {os.path.basename(name)}](https://huggingface.co/datasets/{repo_id}/resolve/main/{name})"
            for name in pdf_files
        ]
        return f"**πŸ“š Uploaded PDFs ({len(pdf_files)}):**\n" + "\n".join(links)
    except Exception as ee:
        print (ee)
        return f"**❌ Cannot load PDF list**error: {ee}"
#===============================================
# Gradio UI
# Gradio UI: two QA columns (TinyLlama / flan-t5) plus a document-management
# row (list, upload, repo reset). Button wiring connects the handlers above.
with gr.Blocks(title="N R L C H A T B O T - for commercial procurement - Supply", css="""
#blue-col { background: linear-gradient(135deg, #667eea, #764ba2); padding: 20px; border-radius: 10px; }
#green-col { background: #4ecdc4; padding: 20px; border-radius: 10px; }
""") as demo:
    gr.Markdown("## 🧠 For use of NRL procurement department Only")
    with gr.Row():
        # LEFT COLUMN: TinyLama Model
        with gr.Column(elem_id="blue-col",scale=1):
            gr.Markdown("## 🧠 Using heavy TinyLama Model")
            with gr.Row():
                Index_processing_output=gr.Textbox(label="πŸ“ Status for tiny lama", interactive=False)
            with gr.Row():
                # Builds the TinyLlama QA chain from the repo index on click.
                Index_processing_btn = gr.Button("πŸ”„ Clik to get the udated resources with tiny Lama")
                Index_processing_btn.click(bePrepare, inputs=None, outputs=Index_processing_output)
            with gr.Row():
                query_input = gr.Textbox(label="❓ Your Question pls")
            with gr.Row():
                query_btn = gr.Button("🧠 Get Answer")
            with gr.Row():
                answer_output = gr.Textbox(
                    label="βœ… Answer with Document Links",
                    lines=8
                )
            query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
        # RIGHT COLUMN: google\flan-t5
        with gr.Column(elem_id="green-col",scale=2):
            gr.Markdown("## 🧠 Using ligth model - google flan-t5")
            Index_processing_output1=gr.Textbox(label="πŸ“ Status for google flan-t5", interactive=False)
            # Builds the flan-t5 QA chain from the repo index on click.
            Index_processing_btn1 = gr.Button("πŸ”„ Clik to get the udated resources with google flan-t5")
            Index_processing_btn1.click(bePrepare1, inputs=None, outputs=Index_processing_output1)
            query_input1 = gr.Textbox(label="❓ Your Question pls")
            query_btn1 = gr.Button("🧠 Get Answer")
            answer_output1 = gr.Textbox(
                label="βœ… Answer with Document Links",
                lines=8
            )
            summary_output = gr.Markdown("**Summary will appear here**")
            query_btn1.click(
                ask_question1,
                inputs=query_input1,
                outputs=answer_output1
            )
    with gr.Row():
        # LEFT COLUMN: Document Management (list + refresh)
        with gr.Column(elem_id="green-col",scale=1):
            gr.Markdown("## πŸ“š Uploaded Documents")
            with gr.Row():
                pdf_list = gr.Markdown("**No documents loaded yet.**")
                refresh_btn = gr.Button("πŸ”„ Refresh")
                refresh_btn.click(get_pdf_list,inputs=None,outputs=pdf_list)
        # RIGHT COLUMN: password-gated upload and full-repo reset
        with gr.Column(elem_id="blue-col",scale=1):
            gr.Markdown("## 🧠 For uploading new PDF documents.")
            with gr.Row():
                output_msg = gr.Textbox(label="πŸ“ Authorization Message", interactive=False)
            with gr.Row():
                file_input = gr.File(label="πŸ“„ Upload .pdf File by only authorized user", type="filepath")
            with gr.Row():
                authorized_user=gr.Textbox(label="Write the password to upload new Circular Doc.")
            with gr.Row():
                upload_btn = gr.Button("πŸ”„ Process Doc")
                # Indexes the uploaded PDF and refreshes both status + list.
                upload_btn.click(upload_and_prepare, inputs=[file_input,authorized_user], outputs=[output_msg,pdf_list])
            with gr.Row():
                # Destructive: deletes and recreates the whole dataset repo.
                delete_btn = gr.Button("πŸ”„ Delete complete repo")
                delete_btn.click(delete_entire_repo, inputs=authorized_user, outputs=output_msg)
# For local dev use: demo.launch()
# For HF Spaces
if __name__ == "__main__":
    demo.launch()