Spaces:

tanya17
/

smartchabot

Sleeping

App Files Files Community

smartchabot / app.py

tanya17

Update app.py

e314452 verified 11 months ago

raw

history blame contribute delete

2.48 kB

	import gradio as gr
	from PyPDF2 import PdfReader
	from paddleocr import PaddleOCR
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
	import os

	# Load the seq2seq model locally through transformers (no hosted inference API).
	# NOTE(review): from_pretrained presumably fetches/caches the weights on first
	# run — confirm this is acceptable for the deployment environment.
	model_name = "google/flan-t5-base"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
	# Text-to-text generation pipeline; answer_query sends its prompt through it.
	local_llm = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

	# OCR engine used by extract_text for image uploads
	# (configured with lang='en' and use_angle_cls=True).
	ocr_model = PaddleOCR(use_angle_cls=True, lang='en')
	# Module-level store of processed uploads: one {"filename": ..., "text": ...}
	# dict per file, rebuilt by process_files and read by answer_query.
	documents = []

	def extract_text(file):
	    """Return the text content of an uploaded PDF or image file.

	    Args:
	        file: Uploaded-file object exposing a ``name`` attribute that holds
	            the path of the file on disk.

	    Returns:
	        The extracted text: concatenated page text for ``.pdf`` files,
	        space-joined OCR lines for ``.jpg``/``.jpeg``/``.png`` files, and an
	        empty string for any other extension or when no text is found.
	    """
	    path = file.name
	    ext = os.path.splitext(path)[1].lower()

	    if ext == ".pdf":
	        # Open by path instead of handing over the upload wrapper itself:
	        # the wrapper is not guaranteed to be a readable binary stream.
	        reader = PdfReader(path)
	        # extract_text() may yield None for a page without a text layer.
	        return "".join(page.extract_text() or "" for page in reader.pages)

	    if ext in (".jpg", ".jpeg", ".png"):
	        result = ocr_model.ocr(path)
	        # An image with no detected text comes back as an empty result or
	        # as [None]; iterating that blindly raises TypeError.
	        detections = result[0] if result else None
	        if not detections:
	            return ""
	        # line[1][0] is the recognized string of one detection.
	        return " ".join(line[1][0] for line in detections)

	    # Unsupported extension: nothing to extract.
	    return ""

	def process_files(files):
	    """Extract text from every uploaded file and replace the document store.

	    Args:
	        files: List of uploaded-file objects (each exposing ``name``), or
	            ``None``/empty when nothing was uploaded.

	    Returns:
	        A status message stating how many files were processed.
	    """
	    global documents
	    # The upload component yields None when the button is clicked before
	    # anything is uploaded; treat that as "no files" instead of failing.
	    uploads = files or []
	    # Build the new list completely before rebinding the global, so a
	    # failure while extracting one file leaves the previous store intact
	    # rather than half-filled.
	    documents = [
	        {"filename": upload.name, "text": extract_text(upload)}
	        for upload in uploads
	    ]
	    return f"{len(uploads)} files processed and stored."

	def answer_query(query):
	    """Answer a question about the processed documents with the local model.

	    Args:
	        query: The user's question as free text.

	    Returns:
	        The generated answer, or a user-facing message when the question is
	        blank, no documents have been processed, or generation fails.
	    """
	    # A blank question would still trigger a full generation pass on an
	    # unanswerable prompt, so reject it up front.
	    if not query or not query.strip():
	        return "Please enter a question."
	    if not documents:
	        return "Please upload and process files first."

	    # Only the first 2000 characters of each document go into the prompt.
	    # NOTE(review): with several documents the prompt may still exceed the
	    # model's input window — confirm whether further truncation is needed.
	    sections = [
	        f"\nDocument {index} ({doc['filename']}):\n{doc['text'][:2000]}\n"
	        for index, doc in enumerate(documents, start=1)
	    ]
	    prompt = (
	        "Analyze the following documents and answer the query:\n"
	        + "".join(sections)
	        + f"\n\nQuestion: {query}\nAnswer with themes and citations."
	    )

	    try:
	        response = local_llm(prompt, max_length=256, do_sample=True, temperature=0.7)
	        return response[0]['generated_text']
	    except Exception as e:
	        # UI boundary: surface the failure in the answer box instead of raising.
	        return f"❌ Error: {e}"

	# Gradio interface: upload + process documents, then ask questions about them.
	with gr.Blocks() as demo:
	    gr.Markdown("# 📄 Document Theme Identification Chatbot (Offline Hugging Face Model)")

	    with gr.Row():
	        # ".jpeg" is listed alongside ".jpg" because extract_text handles
	        # both; without it such images cannot be selected for upload.
	        file_input = gr.File(
	            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
	            file_count="multiple",
	            label="Upload Documents",
	        )
	        process_btn = gr.Button("Process Documents")

	    process_output = gr.Textbox(label="Processing Status")

	    with gr.Row():
	        query_input = gr.Textbox(label="Ask a question")
	        query_btn = gr.Button("Get Answer")

	    answer_output = gr.Textbox(label="Answer with Themes and Citations", lines=10)

	    # Wire the buttons to their handlers.
	    process_btn.click(fn=process_files, inputs=[file_input], outputs=[process_output])
	    query_btn.click(fn=answer_query, inputs=[query_input], outputs=[answer_output])

	# Start the server only when run as a script, so importing this module
	# (e.g. to reuse `demo`) does not launch it as a side effect.
	if __name__ == "__main__":
	    demo.launch()