Spaces:

admin08077
/

aitr

Sleeping

App Files Files Community

admin08077 committed on Jan 14

Commit

453fb59

verified ·

1 Parent(s): 6bfc829

Update app.py

Browse files

Files changed (1) hide show

app.py +202 -385

app.py CHANGED Viewed

@@ -1,398 +1,215 @@
 import gradio as gr
-import os
 import io
 import base64
-import uuid
-import nltk
-import torch
-import tiktoken
-import pytesseract
-import PyPDF2
-import cv2
-import tempfile
-from PIL import Image
-# Transformers
-from transformers import (
-    pipeline,
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    AutoModelForSeq2SeqLM,
-    AutoModelForQuestionAnswering,
-)
-# We will use a local whisper model for STT
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 nltk.download("punkt", quiet=True)
 ###############################################################################
-#                           1. Load All Models Locally                        #
-###############################################################################
-# Zero-Shot Classification Model (Topic Detection)
-ZSC_MODEL_NAME = "facebook/bart-large-mnli"
-zsc_tokenizer = AutoTokenizer.from_pretrained(ZSC_MODEL_NAME, force_download=True)
-zsc_model = AutoModelForSequenceClassification.from_pretrained(ZSC_MODEL_NAME, force_download=True)
-zero_shot_classifier = pipeline("zero-shot-classification", model=zsc_model, tokenizer=zsc_tokenizer)
-# Summarization Model (Chunk-based Summaries)
-SUM_MODEL_NAME = "facebook/bart-large-cnn"
-sum_tokenizer = AutoTokenizer.from_pretrained(SUM_MODEL_NAME, force_download=True)
-sum_model = AutoModelForSeq2SeqLM.from_pretrained(SUM_MODEL_NAME, force_download=True)
-summarizer = pipeline("summarization", model=sum_model, tokenizer=sum_tokenizer)
-# QA Model (Chunk-based QA)
-QA_MODEL_NAME = "deepset/roberta-base-squad2"
-qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, force_download=True)
-qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_NAME, force_download=True)
-qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)
-# Speech-to-Text (STT) with tiny Whisper
-WHISPER_MODEL_NAME = "openai/whisper-tiny"
-whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME, force_download=True)
-whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME, force_download=True)
-# For real-time token usage, we'll use tiktoken (GPT-3.5 style tokenizer)
-encoding = tiktoken.get_encoding("cl100k_base")
-###############################################################################
-#                           2. Utility Functions                              #
-###############################################################################
-def approximate_tokens(text: str) -> int:
-    return len(encoding.encode(text))
-def chunk_text(text, max_chunk_size=1500):
-    sentences = nltk.sent_tokenize(text)
-    chunks = []
-    current_chunk = ""
-    current_tokens = 0
-    for sent in sentences:
-        sent_tokens = approximate_tokens(sent)
-        if current_tokens + sent_tokens <= max_chunk_size:
-            current_chunk += " " + sent
-            current_tokens += sent_tokens
-        else:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sent
-            current_tokens = sent_tokens
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-def chunk_summarize(text):
-    chunks = chunk_text(text, max_chunk_size=600)
-    summaries = []
-    for ch in chunks:
-        out = summarizer(ch, max_length=150, min_length=40, do_sample=False)
-        summaries.append(out[0]["summary_text"])
-    combined = " ".join(summaries)
-    if len(chunks) > 1:
-        final = summarizer(combined, max_length=150, min_length=40, do_sample=False)
-        return final[0]["summary_text"]
     else:
-        return combined
-def do_topic_detection(text, candidate_labels=None):
-    if candidate_labels is None:
-        candidate_labels = [
-            "legal", "technical", "creative", "finance", "sports", "health",
-            "politics", "education", "entertainment", "business"
-        ]
-    chunks = chunk_text(text, max_chunk_size=512)
-    label_counts = {}
-    for ch in chunks:
-        result = zero_shot_classifier(ch, candidate_labels)
-        top_label = result["labels"][0]
-        label_counts[top_label] = label_counts.get(top_label, 0) + 1
-    sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
-    top_labels = [lbl for (lbl, _) in sorted_labels[:3]]
-    return top_labels
-def do_ocr_on_image(image_bytes):
-    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    return pytesseract.image_to_string(image)
-def is_page_scanned(page_text):
-    return not page_text or len(page_text.strip()) < 20
-def extract_text_from_pdf(pdf_file) -> str:
-    reader = PyPDF2.PdfReader(pdf_file)
-    all_text = []
-    for page_index, page in enumerate(reader.pages):
-        extracted = page.extract_text()
-        if not extracted or is_page_scanned(extracted):
-            try:
-                writer = PyPDF2.PdfWriter()
-                writer.add_page(page)
-                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
-                    writer.write(temp_pdf)
-                    temp_pdf_path = temp_pdf.name
-                from pdf2image import convert_from_path
-                images = convert_from_path(temp_pdf_path)
-                if images:
-                    ocr_text = pytesseract.image_to_string(images[0])
-                    all_text.append(ocr_text)
-                else:
-                    all_text.append("")
-                os.remove(temp_pdf_path)
-            except Exception as e:
-                all_text.append(f"[OCR Error on page {page_index + 1}]: {e}")
-        if extracted:
-            all_text.append(extracted)
-    return "\n".join(all_text)
-def parse_pdf(file_obj):
-    return extract_text_from_pdf(file_obj)
-def parse_json(file_obj):
-    raw = file_obj.read()
-    import json
-    data = json.loads(raw)
-    return json.dumps(data, indent=2)
-def parse_txt(file_obj):
-    return file_obj.read().decode("utf-8", errors="ignore")
-def parse_xml(file_obj):
-    raw = file_obj.read()
-    return raw.decode("utf-8", errors="ignore")
-def parse_image(file_obj):
-    image_bytes = file_obj.read()
-    return do_ocr_on_image(image_bytes)
-def get_file_extension(filename):
-    return filename.split(".")[-1].lower()
 ###############################################################################
-#                    3. Data Structures & In-Memory Store                     #
 ###############################################################################
-SESSIONS = {}
-def create_session():
-    return str(uuid.uuid4())
-###############################################################################
-#                    4. Multi-File Upload and Analysis                        #
-###############################################################################
-def load_files(files, session_id):
-    if session_id not in SESSIONS:
-        SESSIONS[session_id] = {"files": {}, "chat_history": []}
-    results = []
-    for f in files:
-        ext = get_file_extension(f.name)
-        try:
-            if ext == "pdf":
-                content = parse_pdf(f)
-            elif ext in ["png", "jpg", "jpeg", "bmp", "tiff"]:
-                content = parse_image(f)
-            elif ext == "json":
-                content = parse_json(f)
-            elif ext == "xml":
-                content = parse_xml(f)
-            elif ext == "txt":
-                content = parse_txt(f)
-            else:
-                content = parse_txt(f)
-            summary = chunk_summarize(content) if content.strip() else ""
-            topics = do_topic_detection(content) if content.strip() else []
-            pages_text = []
-            if ext == "pdf":
-                f.seek(0)
-                reader = PyPDF2.PdfReader(f)
-                for idx, page in enumerate(reader.pages):
-                    ptext = page.extract_text() or ""
-                    pages_text.append(ptext)
-            else:
-                pages_text.append(content)
-            total_words = len(content.split())
-            total_tokens = approximate_tokens(content)
-            SESSIONS[session_id]["files"][f.name] = {
-                "ext": ext,
-                "content": content,
-                "summary": summary,
-                "topics": topics,
-                "pages": pages_text,
-                "stats": {"words": total_words, "tokens": total_tokens}
-            }
-            result_str = f"**File:** {f.name}\n - Words: {total_words}, Tokens: {total_tokens}\n - Topics: {topics}\n - Summary: {summary[:200]}..."
-            results.append(result_str)
-        except Exception as e:
-            results.append(f"Error loading {f.name}: {e}")
-    return "\n\n".join(results)
-def show_file_insights(session_id):
-    if session_id not in SESSIONS or not SESSIONS[session_id]["files"]:
-        return "No files uploaded yet."
-    msg = []
-    for fname, data in SESSIONS[session_id]["files"].items():
-        msg.append(f"**{fname}**")
-        msg.append(f" - Topics: {data['topics']}")
-        msg.append(f" - Word Count: {data['stats']['words']}, Token Count: {data['stats']['tokens']}")
-        msg.append(f" - Summary: {data['summary'][:300]}...")
-        msg.append("-----")
-    return "\n".join(msg)
-def kill_session(session_id):
-    if session_id in SESSIONS:
-        del SESSIONS[session_id]
-    return "Session data cleared."
-###############################################################################
-#                    5. Reference Finder (Page-Based)                         #
-###############################################################################
-def find_reference(session_id, query):
-    if session_id not in SESSIONS:
-        return "No session."
-    results = []
-    for fname, data in SESSIONS[session_id]["files"].items():
-        pages = data["pages"]
-        for i, ptext in enumerate(pages):
-            if query.lower() in ptext.lower():
-                idx = ptext.lower().find(query.lower())
-                snippet = ptext[max(0, idx-50): idx+len(query)+50]
-                results.append(f"{fname} (page {i+1}): ...{snippet}...")
-    if not results:
-        return "No references found."
-    return "\n\n".join(results)
-###############################################################################
-#                    6. Q&A with Chunk-Based Retrieval                        #
-###############################################################################
-def retrieve_relevant_chunks(session_id, question):
-    if session_id not in SESSIONS:
-        return []
-    text_blocks = []
-    for fname, data in SESSIONS[session_id]["files"].items():
-        chs = chunk_text(data["content"], max_chunk_size=400)
-        for ch in chs:
-            text_blocks.append((fname, ch))
-    question_words = set(question.lower().split())
-    block_scores = []
-    for (fname, block) in text_blocks:
-        block_words = set(block.lower().split())
-        score = len(question_words.intersection(block_words))
-        block_scores.append((score, fname, block))
-    block_scores.sort(key=lambda x: x[0], reverse=True)
-    top_chunks = [bc for bc in block_scores[:3] if bc[0] > 0]
-    return top_chunks
-def answer_question(session_id, question):
-    top_chunks = retrieve_relevant_chunks(session_id, question)
-    if not top_chunks:
-        return "No relevant chunks found in the uploaded files."
-    answers = []
-    for score, fname, block in top_chunks:
-        result = qa_pipeline({"question": question, "context": block})
-        answers.append((result["score"], result["answer"], fname))
-    answers.sort(key=lambda x: x[0], reverse=True)
-    best = answers[0]
-    return f"**Answer:** {best[1]} (confidence={best[0]:.2f}, from file={best[2]})"
-###############################################################################
-#                           7. Chat-Like Interface                            #
-###############################################################################
-def chat(user_input, chat_history, session_id):
-    if session_id not in SESSIONS:
-        SESSIONS[session_id] = {"files": {}, "chat_history": []}
-    # If the user wants to search for a reference:
-    if user_input.lower().startswith("ref:"):
-        query = user_input[4:].strip()
-        result = find_reference(session_id, query)
-        chat_history.append({"role": "assistant", "content": result})
-        return "", chat_history
-    # Process the question using QA:
-    answer = answer_question(session_id, user_input)
-    question_tokens = approximate_tokens(user_input)
-    answer_tokens = approximate_tokens(answer)
-    usage_str = f"Tokens: Q={question_tokens}, A={answer_tokens}, Total={question_tokens + answer_tokens}"
-    full_answer = f"{answer}\n\n({usage_str})"
-    chat_history.append({"role": "assistant", "content": full_answer})
-    return "", chat_history
-###############################################################################
-#                           8. Voice Integration (STT Only)                   #
-###############################################################################
-def transcribe_audio(audio):
-    if audio is None:
-        return ""
-    filepath = audio
-    import torchaudio
-    speech_array, sampling_rate = torchaudio.load(filepath)
-    inputs = whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt")
-    with torch.no_grad():
-        generated_ids = whisper_model.generate(**inputs)
-    transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return transcription.strip()
-###############################################################################
-#                           9. Gradio Interface                               #
-###############################################################################
-def reset_session():
-    sid = create_session()
-    return sid, "New session created."
 with gr.Blocks() as demo:
-    gr.Markdown("# **All-in-One Local File QA + OCR + Summaries + Topics + Voice (STT Only)**")
-    session_id = gr.State(create_session())
-    with gr.Column():
-        gr.Markdown("### 1. File Upload & Analysis")
-        file_uploader = gr.File(file_count="multiple", label="Upload your files (PDF, images, TXT, JSON, XML)")
-        upload_btn = gr.Button("Process Files")
-        upload_output = gr.Markdown()
-        def on_upload(files, sid):
-            return load_files(files, sid)
-        upload_btn.click(on_upload, inputs=[file_uploader, session_id], outputs=upload_output)
-        insights_btn = gr.Button("Show File Insights")
-        insights_output = gr.Markdown()
-        insights_btn.click(fn=show_file_insights, inputs=[session_id], outputs=insights_output)
-        kill_btn = gr.Button("Kill Session")
-        kill_msg = gr.Markdown()
-        kill_btn.click(fn=kill_session, inputs=[session_id], outputs=kill_msg)
-        new_session_btn = gr.Button("Reset Session")
-        new_session_out = gr.Markdown()
-        new_session_btn.click(fn=reset_session, outputs=[session_id, new_session_out])
-    gr.Markdown("### 2. Voice Input (STT Only)")
-    audio_in = gr.Audio(type="filepath", label="Speak your question")
-    stt_btn = gr.Button("Transcribe")
-    stt_output = gr.Textbox(label="Transcribed Text")
-    stt_btn.click(fn=transcribe_audio, inputs=[audio_in], outputs=[stt_output])
-    gr.Markdown("### 3. Chat / Q&A (Enter text below)")
-    # Set type="messages" for openai-style chat messages
-    chatbot = gr.Chatbot(label="Chat History", type="messages")
-    user_input = gr.Textbox(label="Your question (or 'ref: <term>' for reference search)", lines=2)
-    send_btn = gr.Button("Send")
-    def user_message(user_msg, history):
-        history = history + [{"role": "user", "content": user_msg}]
-        return "", history
-    send_btn.click(fn=user_message, inputs=[user_input, chatbot], outputs=[user_input, chatbot], queue=False)
-    def bot_message(history, sid):
-        if not history:
-            return []
-        # The most recent message should be from the user.
-        user_msg = history[-1]["content"]
-        _, updated_history = chat(user_msg, history, sid)
-        return updated_history
-    send_btn.click(fn=bot_message, inputs=[chatbot, session_id], outputs=[chatbot])
-demo.queue().launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+from huggingface_hub import InferenceClient
+import nltk
+import json
 import io
 import base64
+from fpdf import FPDF
+from textblob import TextBlob
 nltk.download("punkt", quiet=True)
 ###############################################################################
+#                           Hugging Face Chat Code                            #
+###############################################################################
+"""
+For more information on `huggingface_hub` Inference API support, please check:
+https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+"""
+# Initialize your Hugging Face model client
+client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p
+):
+    """
+    Streams the chat response from the Hugging Face model.
+    Yields tokens as they arrive, so Gradio can display partial responses.
+    """
+    # Build the messages to send to the model
+    messages = [{"role": "system", "content": system_message}]
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+    messages.append({"role": "user", "content": message})
+    # Streaming response
+    response = ""
+    for partial in client.chat_completion(
+        messages,
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p,
+    ):
+        token = partial.choices[0].delta.get("content", "")
+        response += token
+        yield response
+###############################################################################
+#                       Advanced Text Converter Code                          #
+###############################################################################
+def text_to_sentences(text: str):
+    """Splits the text into sentences using nltk."""
+    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]
+def generate_comments(sentences):
+    """
+    Generates AI-based comments for each sentence using TextBlob
+    sentiment polarity as a simple demonstration.
+    """
+    comments = []
+    for sentence in sentences:
+        polarity = TextBlob(sentence).sentiment.polarity
+        # A simple "AI Insight" comment
+        comment = f"AI Insight: Polarity={polarity:.2f} for sentence: '{sentence}'"
+        comments.append(comment)
+    return comments
+def convert_to_json(sentences, comments):
+    """Creates a JSON structure where each sentence has a comment."""
+    data = [{"sentence": s, "comment": c} for s, c in zip(sentences, comments)]
+    return json.dumps({"sentences": data}, indent=2)
+def convert_to_pdf(sentences, comments):
+    """Creates a PDF where each sentence is listed with a comment."""
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_auto_page_break(auto=True, margin=15)
+    pdf.set_font("Arial", size=12)
+    for s, c in zip(sentences, comments):
+        pdf.multi_cell(0, 10, f"Sentence: {s}", 0, 1)
+        pdf.multi_cell(0, 10, c, 0, 1)
+        pdf.ln(5)
+    pdf_buffer = io.BytesIO()
+    pdf.output(pdf_buffer, 'F')
+    pdf_buffer.seek(0)
+    return pdf_buffer
+def process_text(user_text, output_format):
+    """
+    Main function triggered by the Gradio interface.
+    Returns either JSON text or a PDF file (as bytes).
+    """
+    if not user_text.strip():
+        return "Error: Please provide non-empty text!", None
+    sentences = text_to_sentences(user_text)
+    comments = generate_comments(sentences)
+    if output_format == "JSON":
+        # Return JSON text, no file
+        json_data = convert_to_json(sentences, comments)
+        return json_data, None
     else:
+        # Return PDF as bytes, no text
+        pdf_buffer = convert_to_pdf(sentences, comments)
+        # Gradio expects a tuple: (file_name, file_bytes)
+        return None, ("output.pdf", pdf_buffer.getvalue())
 ###############################################################################
+#                              Gradio UI Layout                               #
 ###############################################################################
 with gr.Blocks() as demo:
+    gr.Markdown("# **Combined Gradio App**")
+    gr.Markdown(
+        """
+Welcome! This app has **two main tabs**:
+1. **AI Chat**: A streaming chat interface with a Hugging Face model.
+2. **Advanced Text Converter**: Convert text to JSON or PDF with AI-based sentiment comments.
+"""
+    )
+    with gr.Tabs():
+        # =========== TAB 1: AI Chat ===========
+        with gr.Tab("AI Chat"):
+            # We can simply use Gradio's ChatInterface for streaming responses
+            gr.Markdown("### Chat with a Hugging Face Model")
+            chat = gr.ChatInterface(
+                fn=respond,
+                additional_inputs=[
+                    gr.Textbox(
+                        value="You are a helpful AI assistant.",
+                        label="System message",
+                    ),
+                    gr.Slider(
+                        minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"
+                    ),
+                    gr.Slider(
+                        minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
+                    ),
+                    gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.95,
+                        step=0.05,
+                        label="Top-p (nucleus sampling)",
+                    ),
+                ],
+            )
+        # =========== TAB 2: Text Converter ===========
+        with gr.Tab("Advanced Text Converter"):
+            gr.Markdown("### Convert text to JSON or PDF with AI comments")
+            input_text = gr.Textbox(
+                label="Enter your text (or paste from a file)",
+                placeholder="Type or paste your text here...",
+                lines=10,
+            )
+            format_dropdown = gr.Dropdown(
+                choices=["JSON", "PDF"],
+                value="JSON",
+                label="Choose output format",
+            )
+            convert_button = gr.Button("Convert")
+            # Two possible outputs: either JSON text or a PDF file
+            output_json = gr.Code(
+                label="JSON Output",
+                language="json",
+                visible=True,
+            )
+            output_file = gr.File(label="PDF Download")
+            def run_conversion(text, fmt):
+                """
+                Helper function to connect with Gradio.
+                Returns either a JSON string or a PDF file handle.
+                """
+                json_str, pdf_file = process_text(text, fmt)
+                # If we got an error or JSON
+                if isinstance(json_str, str) and json_str.startswith("Error:"):
+                    return json_str, None
+                if fmt == "JSON":
+                    # Show JSON in the code area, no file
+                    return json_str, None
+                else:
+                    # Return no text, but a file
+                    return None, pdf_file
+            convert_button.click(
+                fn=run_conversion,
+                inputs=[input_text, format_dropdown],
+                outputs=[output_json, output_file],
+            )
+# Launch the Gradio app
+if __name__ == "__main__":
+    demo.launch()