Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import google.generativeai as genai | |
| from PyPDF2 import PdfReader | |
| from paddleocr import PaddleOCR | |
| import os | |
# Step 1: Gemini API key. It is read from the GEMINI_API_KEY environment
# variable (must be set in Hugging Face Secrets); os.getenv yields None when
# the variable is absent.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# The model id can be overridden through GEMINI_MODEL without touching the
# code; when the variable is unset the original default is used.
model = genai.GenerativeModel(os.getenv("GEMINI_MODEL", "gemini-pro"))

# Step 2: OCR setup (English model with angle classification enabled).
ocr_model = PaddleOCR(use_angle_cls=True, lang="en")

# In-memory store of processed uploads: a list of
# {"filename": ..., "text": ...} dicts, replaced on every processing run.
documents = []
def extract_text(file):
    """Extract plain text from an uploaded PDF or image.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object that exposes its path as ``.name``.

    Returns:
        The extracted text. ``.pdf`` files are read page by page with
        ``PdfReader``; ``.jpg`` / ``.jpeg`` / ``.png`` files are passed to
        ``ocr_model``. Any other extension yields an empty string.
    """
    # Accept plain paths as well as objects that carry the path in `.name`,
    # so the function works regardless of how the upload is handed over.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(path)
        # A page may produce no text; treat that as an empty string.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if ext in (".jpg", ".jpeg", ".png"):
        result = ocr_model.ocr(path)
        # The result (or its first entry) may be empty or None, e.g. when no
        # text is detected; iterating it unguarded would raise TypeError.
        detections = result[0] if result else None
        if not detections:
            return ""
        # Each detection is (box, (text, score)); keep only the text.
        return " ".join(entry[1][0] for entry in detections)

    return ""
def process_files(files):
    """Extract text from every uploaded file and store it in ``documents``.

    The module-level ``documents`` list is reset first, then one
    ``{"filename": ..., "text": ...}`` entry is appended per file.

    Args:
        files: The uploaded files, each either a path or an object exposing
            its path as ``.name``. ``None`` (nothing uploaded) is treated as
            an empty list.

    Returns:
        A status message stating how many files were processed.
    """
    global documents
    documents = []
    # Clicking the button with no upload selected can deliver None.
    uploads = files or []
    for upload in uploads:
        if isinstance(upload, (str, os.PathLike)):
            filename = os.fspath(upload)
        else:
            filename = upload.name
        text = extract_text(upload)
        documents.append({"filename": filename, "text": text})
    return f"{len(uploads)} files processed and stored."
def answer_query(query):
    """Answer a question about the processed documents using the Gemini model.

    Builds one prompt containing the first 2000 characters of every entry in
    ``documents`` followed by the question, then sends it to ``model``.

    Args:
        query: The user's question.

    Returns:
        The model's answer text, or a short guidance message when no
        documents are stored, the question is blank, or the model response
        contains no text.
    """
    if not documents:
        return "Please upload and process files first."
    # Do not spend an API call on a blank question.
    if not query or not query.strip():
        return "Please enter a question."

    parts = [
        "You are a research assistant. Analyze the following documents and answer the query.\n"
    ]
    for number, doc in enumerate(documents, start=1):
        # Only the first 2000 characters of each document go into the prompt.
        parts.append(
            f"\nDocument {number} ({doc['filename']}):\n{doc['text'][:2000]}\n"
        )
    parts.append(
        f"\n\nQuestion: {query}\nAnswer with key themes and cite document numbers."
    )
    prompt = "".join(parts)

    response = model.generate_content(prompt)
    try:
        # Accessing .text raises ValueError when the response carries no
        # text part (for example a blocked or empty response).
        return response.text
    except ValueError:
        return "The model returned no text for this request (the response may have been blocked or empty)."
# Step 3: Gradio interface
# NOTE(review): the nesting below is reconstructed from source whose
# indentation was lost; confirm which widgets belong inside each Row.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Gemini Document Research & Theme Identification Chatbot")

    # Upload + processing controls.
    with gr.Row():
        # ".jpeg" is listed so the upload filter matches the extensions
        # that extract_text handles.
        file_input = gr.File(
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            file_count="multiple",
            label="Upload Documents",
        )
        process_btn = gr.Button("Process Documents")
    process_output = gr.Textbox(label="Processing Status")

    # Question + answer controls.
    with gr.Row():
        query_input = gr.Textbox(label="Ask a question")
        query_btn = gr.Button("Get Answer")
    answer_output = gr.Textbox(label="Answer with Themes and Citations", lines=10)

    # Event wiring: each button feeds its input widget to the handler and
    # shows the returned string in the matching output box.
    process_btn.click(fn=process_files, inputs=[file_input], outputs=[process_output])
    query_btn.click(fn=answer_query, inputs=[query_input], outputs=[answer_output])

demo.launch()