Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| from paddleocr import PaddleOCR | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline | |
| import os | |
# --- Local text2text model: loaded in-process, no remote inference API ---
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
local_llm = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
)

# --- OCR engine used for image uploads (English, use_angle_cls enabled) ---
ocr_model = PaddleOCR(use_angle_cls=True, lang="en")

# Documents from the most recent processing run. Each entry is a
# {"filename": ..., "text": ...} dict appended by process_files.
documents = []
def extract_text(file):
    """Extract plain text from an uploaded PDF or image file.

    Args:
        file: The uploaded file. Either an object exposing the file's path
            as ``.name`` or a plain path string.

    Returns:
        The extracted text. PDFs are read page by page with ``PdfReader``;
        ``.jpg``/``.jpeg``/``.png`` images are run through ``ocr_model``.
        An empty string is returned for any other extension, and when OCR
        reports no text lines for the image.
    """
    # Accept plain path strings as well as objects carrying the path in
    # ``.name``, so the function does not depend on the upload wrapper type.
    path = getattr(file, "name", file)
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(file)
        # page.extract_text() may yield nothing for a page; count it as "".
        return "".join(page.extract_text() or "" for page in reader.pages)

    if ext in (".jpg", ".jpeg", ".png"):
        result = ocr_model.ocr(path)
        # result[0] holds the detected lines for the single input image.
        # PaddleOCR can hand back None (or an empty list) there when no text
        # is detected, so guard before iterating instead of raising TypeError.
        lines = result[0] if result else None
        if not lines:
            return ""
        # line[1][0] is the recognised string of each detected line.
        return " ".join(line[1][0] for line in lines)

    # Unsupported extension: nothing to extract.
    return ""
def process_files(files):
    """Extract text from every uploaded file and store it in ``documents``.

    The module-level ``documents`` list is reset on every call, so only the
    files from the latest run are kept.

    Args:
        files: Iterable of uploaded files, or ``None`` when nothing was
            uploaded.

    Returns:
        A status message stating how many files were processed.
    """
    global documents
    # The upload input can deliver None when no file was selected; treat that
    # as an empty upload rather than failing on iteration / len().
    uploads = files or []
    documents = []
    for f in uploads:
        text = extract_text(f)
        # Fall back to the value itself when the upload is a plain path string.
        documents.append({"filename": getattr(f, "name", f), "text": text})
    return f"{len(uploads)} files processed and stored."
def answer_query(query):
    """Answer a question about the processed documents using ``local_llm``.

    Args:
        query: The user's question.

    Returns:
        The generated answer text. A hint message is returned instead when no
        documents are stored or when the question is blank, and a message
        starting with "❌ Error:" is returned if generation raises.
    """
    if not documents:
        return "Please upload and process files first."
    # A blank question would only feed the model an empty "Question:" slot.
    if not query or not str(query).strip():
        return "Please enter a question."

    # Each document contributes at most its first 2000 characters.
    sections = [
        f"\nDocument {number} ({doc['filename']}):\n{doc['text'][:2000]}\n"
        for number, doc in enumerate(documents, start=1)
    ]
    prompt = "".join(
        [
            "Analyze the following documents and answer the query:\n",
            *sections,
            f"\n\nQuestion: {query}\nAnswer with themes and citations.",
        ]
    )

    try:
        response = local_llm(prompt, max_length=256, do_sample=True, temperature=0.7)
        return response[0]['generated_text']
    except Exception as e:
        # UI boundary: surface the failure in the answer box instead of raising.
        return f"❌ Error: {str(e)}"
# Gradio Interface
# NOTE(review): the original indentation was lost; the nesting below (two
# control rows, each followed by its output box) is reconstructed -- confirm.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Document Theme Identification Chatbot (Offline Hugging Face Model)")

    # Upload and processing controls.
    with gr.Row():
        # ".jpeg" is listed because extract_text accepts it alongside ".jpg";
        # without it the upload filter would not offer such files.
        file_input = gr.File(
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            file_count="multiple",
            label="Upload Documents",
        )
        process_btn = gr.Button("Process Documents")
    process_output = gr.Textbox(label="Processing Status")

    # Question and answer controls.
    with gr.Row():
        query_input = gr.Textbox(label="Ask a question")
        query_btn = gr.Button("Get Answer")
    answer_output = gr.Textbox(label="Answer with Themes and Citations", lines=10)

    # Wire the buttons to their handlers.
    process_btn.click(fn=process_files, inputs=[file_input], outputs=[process_output])
    query_btn.click(fn=answer_query, inputs=[query_input], outputs=[answer_output])

demo.launch()