Spaces:

admin08077
/

aitr

Sleeping

App Files Files Community

admin08077 commited on Jan 14, 2025

Commit

0ca4cd3

verified ·

1 Parent(s): 42a9179

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -93

app.py CHANGED Viewed

@@ -63,19 +63,13 @@ encoding = tiktoken.get_encoding("cl100k_base")
 ###############################################################################
 def approximate_tokens(text: str) -> int:
-    # Return an approximate token count using the chosen tokenizer
     return len(encoding.encode(text))
 def chunk_text(text, max_chunk_size=1500):
-    """
-    Splits text into chunks of <= max_chunk_size tokens (approx).
-    We'll do a naive approach: break on sentence boundaries from nltk.
-    """
     sentences = nltk.sent_tokenize(text)
     chunks = []
     current_chunk = ""
     current_tokens = 0
     for sent in sentences:
         sent_tokens = approximate_tokens(sent)
         if current_tokens + sent_tokens <= max_chunk_size:
@@ -91,80 +85,53 @@ def chunk_text(text, max_chunk_size=1500):
     return chunks
 def chunk_summarize(text):
-    """
-    Summarize large text by chunking and then joining partial summaries.
-    """
-    chunks = chunk_text(text, max_chunk_size=600)  # 600 tokens ~ smaller chunk for BART
     summaries = []
     for ch in chunks:
-        # Summarize each chunk
         out = summarizer(ch, max_length=150, min_length=40, do_sample=False)
         summaries.append(out[0]["summary_text"])
-    # Optionally summarize the summaries again if needed
     combined = " ".join(summaries)
     if len(chunks) > 1:
-        # Summarize the combined result to get a final summary
         final = summarizer(combined, max_length=150, min_length=40, do_sample=False)
         return final[0]["summary_text"]
     else:
         return combined
 def do_topic_detection(text, candidate_labels=None):
-    """
-    Zero-shot classify the text. If no candidate_labels given, use a default.
-    """
     if candidate_labels is None:
         candidate_labels = [
-            "legal", "technical", "creative", "finance", "sports", "health", "politics",
-            "education", "entertainment", "business"
         ]
-    # We'll chunk the text to keep it from being too large
     chunks = chunk_text(text, max_chunk_size=512)
-    # We'll do a naive approach: classify each chunk, take the top label
     label_counts = {}
     for ch in chunks:
         result = zero_shot_classifier(ch, candidate_labels)
         top_label = result["labels"][0]
         label_counts[top_label] = label_counts.get(top_label, 0) + 1
-    # Return the top 3
     sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
     top_labels = [lbl for (lbl, _) in sorted_labels[:3]]
     return top_labels
 def do_ocr_on_image(image_bytes):
-    """
-    OCR an image (bytes) using Tesseract.
-    """
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     return pytesseract.image_to_string(image)
 def is_page_scanned(page_text):
-    """If PyPDF2 doesn't return text or it's extremely short, we assume scanned."""
-    if not page_text or len(page_text.strip()) < 20:
-        return True
-    return False
 def extract_text_from_pdf(pdf_file) -> str:
-    """
-    Attempt to extract text from each page using PyPDF2.
-    If a page appears scanned, fallback to OCR using Tesseract.
-    """
     reader = PyPDF2.PdfReader(pdf_file)
     all_text = []
     for page_index, page in enumerate(reader.pages):
-        # Try native extraction
         extracted = page.extract_text()
         if not extracted or is_page_scanned(extracted):
-            # Convert page to image and apply OCR
             try:
-                # Extract single page as a separate PDF
                 writer = PyPDF2.PdfWriter()
                 writer.add_page(page)
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                     writer.write(temp_pdf)
                     temp_pdf_path = temp_pdf.name
-                # Convert PDF to image using pdf2image
                 from pdf2image import convert_from_path
                 images = convert_from_path(temp_pdf_path)
                 if images:
@@ -196,7 +163,6 @@ def parse_xml(file_obj):
     return raw.decode("utf-8", errors="ignore")
 def parse_image(file_obj):
-    # OCR the image
     image_bytes = file_obj.read()
     return do_ocr_on_image(image_bytes)
@@ -206,26 +172,18 @@ def get_file_extension(filename):
 ###############################################################################
 #                    3. Data Structures & In-Memory Store                     #
 ###############################################################################
-# We'll store user sessions in a dict: session_id -> {files: {...}, chat_history: [...]}
 SESSIONS = {}
 def create_session():
     return str(uuid.uuid4())
 ###############################################################################
 #                    4. Multi-File Upload and Analysis                        #
 ###############################################################################
 def load_files(files, session_id):
-    """
-    1. For each file, parse the text,
-    2. Summarize it (chunk-based),
-    3. Detect topics,
-    4. Store page-by-page for reference search.
-    """
     if session_id not in SESSIONS:
         SESSIONS[session_id] = {"files": {}, "chat_history": []}
     results = []
     for f in files:
         ext = get_file_extension(f.name)
@@ -242,12 +200,8 @@ def load_files(files, session_id):
                 content = parse_txt(f)
             else:
                 content = parse_txt(f)
-            # Summarize
             summary = chunk_summarize(content) if content.strip() else ""
-            # Topics
             topics = do_topic_detection(content) if content.strip() else []
-            # Page-level split (for reference search)
             pages_text = []
             if ext == "pdf":
                 f.seek(0)
@@ -257,23 +211,16 @@ def load_files(files, session_id):
                     pages_text.append(ptext)
             else:
                 pages_text.append(content)
-            # Stats
             total_words = len(content.split())
             total_tokens = approximate_tokens(content)
             SESSIONS[session_id]["files"][f.name] = {
                 "ext": ext,
                 "content": content,
                 "summary": summary,
                 "topics": topics,
                 "pages": pages_text,
-                "stats": {
-                    "words": total_words,
-                    "tokens": total_tokens,
-                }
             }
             result_str = f"**File:** {f.name}\n - Words: {total_words}, Tokens: {total_tokens}\n - Topics: {topics}\n - Summary: {summary[:200]}..."
             results.append(result_str)
         except Exception as e:
@@ -300,11 +247,8 @@ def kill_session(session_id):
 ###############################################################################
 #                    5. Reference Finder (Page-Based)                         #
 ###############################################################################
 def find_reference(session_id, query):
-    """
-    Naively search each page (or full text for non-PDFs) for the query,
-    then return a snippet.
-    """
     if session_id not in SESSIONS:
         return "No session."
     results = []
@@ -322,11 +266,8 @@ def find_reference(session_id, query):
 ###############################################################################
 #                    6. Q&A with Chunk-Based Retrieval                        #
 ###############################################################################
 def retrieve_relevant_chunks(session_id, question):
-    """
-    Combine file contents, chunk them, and select the top chunks matching the question.
-    (A real implementation might use embeddings; this uses a naive approach.)
-    """
     if session_id not in SESSIONS:
         return []
     text_blocks = []
@@ -345,9 +286,6 @@ def retrieve_relevant_chunks(session_id, question):
     return top_chunks
 def answer_question(session_id, question):
-    """
-    Use chunk-based QA (with roberta-base-squad2) on the top relevant chunks and return the best answer.
-    """
     top_chunks = retrieve_relevant_chunks(session_id, question)
     if not top_chunks:
         return "No relevant chunks found in the uploaded files."
@@ -362,11 +300,8 @@ def answer_question(session_id, question):
 ###############################################################################
 #                           7. Chat-Like Interface                            #
 ###############################################################################
 def chat(user_input, chat_history, session_id):
-    """
-    Append the user query to the chat history, run QA,
-    and append the response. Displays approximate token usage.
-    """
     if session_id not in SESSIONS:
         SESSIONS[session_id] = {"files": {}, "chat_history": []}
     if user_input.lower().startswith("ref:"):
@@ -385,10 +320,8 @@ def chat(user_input, chat_history, session_id):
 ###############################################################################
 #                           8. Voice Integration (STT Only)                   #
 ###############################################################################
 def transcribe_audio(audio):
-    """
-    Transcribe the uploaded audio using the local Whisper tiny model.
-    """
     if audio is None:
         return ""
     filepath = audio
@@ -411,53 +344,53 @@ def reset_session():
 with gr.Blocks() as demo:
     gr.Markdown("# **All-in-One Local File QA + OCR + Summaries + Topics + Voice (STT Only)**")
     session_id = gr.State(create_session())
     with gr.Column():
         gr.Markdown("### 1. File Upload & Analysis")
         file_uploader = gr.File(file_count="multiple", label="Upload your files (PDF, images, TXT, JSON, XML)")
         upload_btn = gr.Button("Process Files")
         upload_output = gr.Markdown()
         def on_upload(files, sid):
             return load_files(files, sid)
         upload_btn.click(on_upload, inputs=[file_uploader, session_id], outputs=upload_output)
         insights_btn = gr.Button("Show File Insights")
         insights_output = gr.Markdown()
         insights_btn.click(fn=show_file_insights, inputs=[session_id], outputs=insights_output)
         kill_btn = gr.Button("Kill Session")
         kill_msg = gr.Markdown()
         kill_btn.click(fn=kill_session, inputs=[session_id], outputs=kill_msg)
         new_session_btn = gr.Button("Reset Session")
         new_session_out = gr.Markdown()
         new_session_btn.click(fn=reset_session, outputs=[session_id, new_session_out])
     gr.Markdown("### 2. Voice Input (STT Only)")
-    # Removed the "source" argument as it is not supported in this version.
     audio_in = gr.Audio(type="filepath", label="Speak your question")
     stt_btn = gr.Button("Transcribe")
     stt_output = gr.Textbox(label="Transcribed Text")
     stt_btn.click(fn=transcribe_audio, inputs=[audio_in], outputs=[stt_output])
     gr.Markdown("### 3. Chat / Q&A (Enter text below)")
-    chatbot = gr.Chatbot(label="Chat History")
     user_input = gr.Textbox(label="Your question (or 'ref: <term>' for reference search)", lines=2)
     send_btn = gr.Button("Send")
     def user_message(user_msg, history):
         history = history + [[user_msg, None]]
         return "", history
     send_btn.click(fn=user_message, inputs=[user_input, chatbot], outputs=[user_input, chatbot], queue=False)
     def bot_message(history, sid):
         user_msg = history[-1][0]
-        _, updated_history = chat(user_msg, history[:-1], sid)
         return updated_history
     send_btn.click(fn=bot_message, inputs=[chatbot, session_id], outputs=[chatbot])
 demo.queue().launch(server_name="0.0.0.0", server_port=7860)

 ###############################################################################
 def approximate_tokens(text: str) -> int:
     return len(encoding.encode(text))
 def chunk_text(text, max_chunk_size=1500):
     sentences = nltk.sent_tokenize(text)
     chunks = []
     current_chunk = ""
     current_tokens = 0
     for sent in sentences:
         sent_tokens = approximate_tokens(sent)
         if current_tokens + sent_tokens <= max_chunk_size:
     return chunks
 def chunk_summarize(text):
+    chunks = chunk_text(text, max_chunk_size=600)
     summaries = []
     for ch in chunks:
         out = summarizer(ch, max_length=150, min_length=40, do_sample=False)
         summaries.append(out[0]["summary_text"])
     combined = " ".join(summaries)
     if len(chunks) > 1:
         final = summarizer(combined, max_length=150, min_length=40, do_sample=False)
         return final[0]["summary_text"]
     else:
         return combined
 def do_topic_detection(text, candidate_labels=None):
     if candidate_labels is None:
         candidate_labels = [
+            "legal", "technical", "creative", "finance", "sports", "health",
+            "politics", "education", "entertainment", "business"
         ]
     chunks = chunk_text(text, max_chunk_size=512)
     label_counts = {}
     for ch in chunks:
         result = zero_shot_classifier(ch, candidate_labels)
         top_label = result["labels"][0]
         label_counts[top_label] = label_counts.get(top_label, 0) + 1
     sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
     top_labels = [lbl for (lbl, _) in sorted_labels[:3]]
     return top_labels
 def do_ocr_on_image(image_bytes):
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     return pytesseract.image_to_string(image)
 def is_page_scanned(page_text):
+    return not page_text or len(page_text.strip()) < 20
 def extract_text_from_pdf(pdf_file) -> str:
     reader = PyPDF2.PdfReader(pdf_file)
     all_text = []
     for page_index, page in enumerate(reader.pages):
         extracted = page.extract_text()
         if not extracted or is_page_scanned(extracted):
             try:
                 writer = PyPDF2.PdfWriter()
                 writer.add_page(page)
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                     writer.write(temp_pdf)
                     temp_pdf_path = temp_pdf.name
                 from pdf2image import convert_from_path
                 images = convert_from_path(temp_pdf_path)
                 if images:
     return raw.decode("utf-8", errors="ignore")
 def parse_image(file_obj):
     image_bytes = file_obj.read()
     return do_ocr_on_image(image_bytes)
 ###############################################################################
 #                    3. Data Structures & In-Memory Store                     #
 ###############################################################################
 SESSIONS = {}
 def create_session():
     return str(uuid.uuid4())
 ###############################################################################
 #                    4. Multi-File Upload and Analysis                        #
 ###############################################################################
 def load_files(files, session_id):
     if session_id not in SESSIONS:
         SESSIONS[session_id] = {"files": {}, "chat_history": []}
     results = []
     for f in files:
         ext = get_file_extension(f.name)
                 content = parse_txt(f)
             else:
                 content = parse_txt(f)
             summary = chunk_summarize(content) if content.strip() else ""
             topics = do_topic_detection(content) if content.strip() else []
             pages_text = []
             if ext == "pdf":
                 f.seek(0)
                     pages_text.append(ptext)
             else:
                 pages_text.append(content)
             total_words = len(content.split())
             total_tokens = approximate_tokens(content)
             SESSIONS[session_id]["files"][f.name] = {
                 "ext": ext,
                 "content": content,
                 "summary": summary,
                 "topics": topics,
                 "pages": pages_text,
+                "stats": {"words": total_words, "tokens": total_tokens}
             }
             result_str = f"**File:** {f.name}\n - Words: {total_words}, Tokens: {total_tokens}\n - Topics: {topics}\n - Summary: {summary[:200]}..."
             results.append(result_str)
         except Exception as e:
 ###############################################################################
 #                    5. Reference Finder (Page-Based)                         #
 ###############################################################################
 def find_reference(session_id, query):
     if session_id not in SESSIONS:
         return "No session."
     results = []
 ###############################################################################
 #                    6. Q&A with Chunk-Based Retrieval                        #
 ###############################################################################
 def retrieve_relevant_chunks(session_id, question):
     if session_id not in SESSIONS:
         return []
     text_blocks = []
     return top_chunks
 def answer_question(session_id, question):
     top_chunks = retrieve_relevant_chunks(session_id, question)
     if not top_chunks:
         return "No relevant chunks found in the uploaded files."
 ###############################################################################
 #                           7. Chat-Like Interface                            #
 ###############################################################################
 def chat(user_input, chat_history, session_id):
     if session_id not in SESSIONS:
         SESSIONS[session_id] = {"files": {}, "chat_history": []}
     if user_input.lower().startswith("ref:"):
 ###############################################################################
 #                           8. Voice Integration (STT Only)                   #
 ###############################################################################
 def transcribe_audio(audio):
     if audio is None:
         return ""
     filepath = audio
 with gr.Blocks() as demo:
     gr.Markdown("# **All-in-One Local File QA + OCR + Summaries + Topics + Voice (STT Only)**")
     session_id = gr.State(create_session())
     with gr.Column():
         gr.Markdown("### 1. File Upload & Analysis")
         file_uploader = gr.File(file_count="multiple", label="Upload your files (PDF, images, TXT, JSON, XML)")
         upload_btn = gr.Button("Process Files")
         upload_output = gr.Markdown()
         def on_upload(files, sid):
             return load_files(files, sid)
         upload_btn.click(on_upload, inputs=[file_uploader, session_id], outputs=upload_output)
         insights_btn = gr.Button("Show File Insights")
         insights_output = gr.Markdown()
         insights_btn.click(fn=show_file_insights, inputs=[session_id], outputs=insights_output)
         kill_btn = gr.Button("Kill Session")
         kill_msg = gr.Markdown()
         kill_btn.click(fn=kill_session, inputs=[session_id], outputs=kill_msg)
         new_session_btn = gr.Button("Reset Session")
         new_session_out = gr.Markdown()
         new_session_btn.click(fn=reset_session, outputs=[session_id, new_session_out])
     gr.Markdown("### 2. Voice Input (STT Only)")
+    # Removed the 'source' parameter because it is not supported in this version.
     audio_in = gr.Audio(type="filepath", label="Speak your question")
     stt_btn = gr.Button("Transcribe")
     stt_output = gr.Textbox(label="Transcribed Text")
     stt_btn.click(fn=transcribe_audio, inputs=[audio_in], outputs=[stt_output])
     gr.Markdown("### 3. Chat / Q&A (Enter text below)")
+    chatbot = gr.Chatbot(label="Chat History", type="messages")
     user_input = gr.Textbox(label="Your question (or 'ref: <term>' for reference search)", lines=2)
     send_btn = gr.Button("Send")
     def user_message(user_msg, history):
         history = history + [[user_msg, None]]
         return "", history
     send_btn.click(fn=user_message, inputs=[user_input, chatbot], outputs=[user_input, chatbot], queue=False)
     def bot_message(history, sid):
+        # Check if history is empty
+        if not history:
+            return []
         user_msg = history[-1][0]
+        _, updated_history = chat(user_msg, history, sid)
         return updated_history
     send_btn.click(fn=bot_message, inputs=[chatbot, session_id], outputs=[chatbot])
 demo.queue().launch(server_name="0.0.0.0", server_port=7860)