# smartchabot / app.py
# Author: tanya17 ("Update app.py", commit e314452, verified)
import gradio as gr
from PyPDF2 import PdfReader
from paddleocr import PaddleOCR
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os
# Load Local Model (No API)
# flan-t5-base is small enough to run fully offline once its weights are cached.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# text2text-generation pipeline consumed by answer_query() below
local_llm = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
# OCR Setup
# use_angle_cls=True enables rotated-text classification; English-only models.
ocr_model = PaddleOCR(use_angle_cls=True, lang='en')
# In-memory store filled by process_files(): list of {"filename": ..., "text": ...}
documents = []
def extract_text(file):
    """Extract raw text from one uploaded file.

    PDFs are read with PyPDF2; .jpg/.jpeg/.png images go through PaddleOCR.
    Any other extension yields an empty string.

    Args:
        file: Uploaded file object exposing a ``.name`` path (gradio File).

    Returns:
        str: Extracted text, ``""`` when nothing could be extracted.
    """
    ext = os.path.splitext(file.name)[1].lower()
    text = ""
    if ext == ".pdf":
        reader = PdfReader(file)
        for page in reader.pages:
            # extract_text() may return None on image-only pages
            text += page.extract_text() or ""
    elif ext in (".jpg", ".jpeg", ".png"):
        result = ocr_model.ocr(file.name)
        # PaddleOCR returns [None] when no text is detected — guard before
        # iterating, otherwise the join below raises TypeError.
        if result and result[0]:
            text = " ".join(line[1][0] for line in result[0])
    return text
def process_files(files):
    """Extract text from each uploaded file and cache it in ``documents``.

    Replaces the module-level ``documents`` store wholesale on every call.

    Args:
        files: List of uploaded file objects from the gradio File input
            (may be None when nothing was uploaded).

    Returns:
        str: Human-readable processing status for the UI.
    """
    global documents
    documents = []
    # Guard against a None upload (gradio passes None when no file is chosen).
    if not files:
        return "0 files processed and stored."
    for f in files:
        documents.append({"filename": f.name, "text": extract_text(f)})
    return f"{len(files)} files processed and stored."
def answer_query(query):
    """Answer *query* against the previously processed documents.

    Builds one prompt containing the (truncated) text of every stored
    document plus the question, then runs the local text2text pipeline.

    Args:
        query: User question as a string.

    Returns:
        str: Model answer, or an instruction/error message for the UI.
    """
    if not documents:
        return "Please upload and process files first."
    prompt = "Analyze the following documents and answer the query:\n"
    for i, doc in enumerate(documents):
        # Cap each document at 2000 chars to keep the prompt within the
        # model's input limit.
        prompt += f"\nDocument {i+1} ({doc['filename']}):\n{doc['text'][:2000]}\n"
    prompt += f"\n\nQuestion: {query}\nAnswer with themes and citations."
    try:
        response = local_llm(prompt, max_length=256, do_sample=True, temperature=0.7)
        return response[0]['generated_text']
    except Exception as e:
        # Surface pipeline failures in the answer box instead of crashing the UI.
        return f"❌ Error: {str(e)}"
# Gradio Interface
# Two-step UI: upload & process documents, then ask questions against them.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Document Theme Identification Chatbot (Offline Hugging Face Model)")
    with gr.Row():
        file_input = gr.File(file_types=[".pdf", ".jpg", ".png"], file_count="multiple", label="Upload Documents")
        process_btn = gr.Button("Process Documents")
    process_output = gr.Textbox(label="Processing Status")
    with gr.Row():
        query_input = gr.Textbox(label="Ask a question")
        query_btn = gr.Button("Get Answer")
    answer_output = gr.Textbox(label="Answer with Themes and Citations", lines=10)
    # Wire buttons to the handlers defined above.
    process_btn.click(fn=process_files, inputs=[file_input], outputs=[process_output])
    query_btn.click(fn=answer_query, inputs=[query_input], outputs=[answer_output])

demo.launch()