bahakizil commited on
Commit
33f677a
·
verified ·
1 Parent(s): 461b947

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -179
app.py CHANGED
@@ -1,213 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import gradio as gr
3
- import tiktoken
4
- import docx
5
- import PyPDF2
6
 
7
- #######################################
8
- # 1) MODEL YÜKLEME
9
- #######################################
10
- # Hugging Face Spaces'de barındırılan bir modeli "gr.load" ile çağırabilirsiniz.
11
- # Örn: model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
 
 
 
12
 
13
- model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
 
 
14
 
15
- def call_model(prompt: str) -> str:
16
  """
17
- Model arayüzünü (model_iface) tek satırda çağırarak sonuç döndürür.
 
 
18
  """
19
- result = model_iface(prompt)
20
- if isinstance(result, str):
21
- return result
22
- return str(result)
23
-
24
- #######################################
25
- # 2) DOSYA OKUMA (PDF/DOCX/TXT)
26
- #######################################
27
- def read_file_to_text(file_obj) -> str:
 
 
28
  """
29
- file_obj: gradio'dan gelen dosya (pdf/docx/txt).
30
- Returns: metin (str)
31
  """
32
- if file_obj is None:
33
- return ""
34
-
35
- file_path = file_obj.name
36
- # Uzantı kontrolü
37
- _, ext = os.path.splitext(file_path)
38
- ext = ext.lower()
39
-
40
- if ext == ".pdf":
41
- return read_pdf(file_path)
42
- elif ext == ".docx":
43
- return read_docx(file_path)
44
- elif ext == ".txt":
45
- return read_txt(file_path)
46
- else:
47
- # Bilinmeyen format - basitçe hata ya da boş dönebilir
48
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def read_pdf(file_path: str) -> str:
 
51
  text = ""
52
  with open(file_path, "rb") as f:
53
- reader = PyPDF2.PdfReader(f)
54
  for page in reader.pages:
55
- text += page.extract_text() + "\n"
 
 
56
  return text
57
 
58
  def read_docx(file_path: str) -> str:
59
- doc = docx.Document(file_path)
60
- full_text = []
 
61
  for para in doc.paragraphs:
62
- full_text.append(para.text)
63
- return "\n".join(full_text)
64
 
65
  def read_txt(file_path: str) -> str:
 
66
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
67
  return f.read()
68
 
69
- #######################################
70
- # 3) TIKTOKEN CHUNK
71
- #######################################
72
- def chunk_text_with_tiktoken(text: str, chunk_size=500, model_name="gpt-3.5-turbo"):
73
  """
74
- text'i 'chunk_size' token uzunluklarında parçalara böler (token bazlı).
 
 
75
  """
76
- encoding = tiktoken.encoding_for_model(model_name)
77
- tokens = encoding.encode(text)
78
- chunks = []
79
- for i in range(0, len(tokens), chunk_size):
80
- sub_tokens = tokens[i:i+chunk_size]
81
- chunk_str = encoding.decode(sub_tokens)
82
- chunks.append(chunk_str)
83
- return chunks
84
-
85
- #######################################
86
- # 4) 11 CHUNK: 4 HEADING + 3 VALIDATION
87
- #######################################
88
- def generate_4_headings_3_validation(full_text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
89
  """
90
- 4 heading (her heading 2 chunk: üretici + kontrol = 8) + 3 validation = 11 chunk
 
 
 
 
91
  """
 
 
92
 
93
- final_output = ""
 
94
 
95
- # ========== HEADING 1 ==========
96
- # 1) üretici
97
- h1_prod = call_model(
98
- f"[HEADING 1 PRODUCTION]\n"
99
- f"Input:\n{full_text}\n"
100
- "Task: 'Heading 1: Introductory overview' with 3000-6000 chars."
101
- )
102
- # 2) kontrol
103
- h1_ctrl = call_model(
104
- f"[HEADING 1 CONTROL]\n"
105
- f"H1 Production:\n{h1_prod}\n"
106
- "Check 3000-6000 chars, fix if needed."
107
- )
108
- final_output += f"<b>HEADING 1 (Corrected)</b><hr>\n{h1_ctrl}\n\n"
109
-
110
- # ========== HEADING 2 ==========
111
- # 3) üretici
112
- h2_prod = call_model(
113
- f"[HEADING 2 PRODUCTION]\n"
114
- f"Input:\n{full_text}\n"
115
- "Task: 'Heading 2: Detailed explanation of common risks' with 500-1200 chars."
116
- )
117
- # 4) kontrol
118
- h2_ctrl = call_model(
119
- f"[HEADING 2 CONTROL]\n"
120
- f"H2 Production:\n{h2_prod}\n"
121
- "Check 500-1200 chars, fix if needed."
122
- )
123
- final_output += f"<b>HEADING 2 (Corrected)</b><hr>\n{h2_ctrl}\n\n"
124
-
125
- # ========== HEADING 3 ==========
126
- # 5) üretici
127
- h3_prod = call_model(
128
- f"[HEADING 3 PRODUCTION]\n"
129
- f"Input:\n{full_text}\n"
130
- "Task: 'Heading 3: Practical examples and solutions' with 500-1200 chars."
131
- )
132
- # 6) kontrol
133
- h3_ctrl = call_model(
134
- f"[HEADING 3 CONTROL]\n"
135
- f"H3 Production:\n{h3_prod}\n"
136
- "Check 500-1200 chars, fix if needed."
137
- )
138
- final_output += f"<b>HEADING 3 (Corrected)</b><hr>\n{h3_ctrl}\n\n"
139
-
140
- # ========== HEADING 4 ==========
141
- # 7) üretici
142
- h4_prod = call_model(
143
- f"[HEADING 4 PRODUCTION]\n"
144
- f"Input:\n{full_text}\n"
145
- "Task: 'Heading 4: Summary and next steps for students' with 500-1200 chars."
146
- )
147
- # 8) kontrol
148
- h4_ctrl = call_model(
149
- f"[HEADING 4 CONTROL]\n"
150
- f"H4 Production:\n{h4_prod}\n"
151
- "Check 500-1200 chars, fix if needed."
152
- )
153
- final_output += f"<b>HEADING 4 (Corrected)</b><hr>\n{h4_ctrl}\n\n"
154
-
155
- # ========== 3 VALIDATION CHUNK ==========
156
- current_text = final_output
157
- for i in range(1, 4):
158
- validation_out = call_model(
159
- f"[VALIDATION #{i}]\n"
160
- f"Current text:\n{current_text}\n"
161
- "Check headings' constraints. If fixes needed, do them. Otherwise 'No changes needed.'"
162
- )
163
- current_text = validation_out
164
 
165
- return current_text
 
166
 
167
- #######################################
168
- # 5) GRADIO ARAYÜZ FONKSİYONU
169
- #######################################
170
- def main_interface(file, manual_text, chunk_size):
171
- """
172
- file: Yüklenen dosya (PDF/DOCX/TXT)
173
- manual_text: Kullanıcının girdiği ham metin
174
- chunk_size: Tiktoken chunk uzunluğu
175
- """
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- # 1) Dosya varsa, ondan metin çekelim
178
- doc_text = read_file_to_text(file)
179
-
180
- # 2) Metni oluştur -> file metni + manual_text
181
- combined_text = (doc_text + "\n" + manual_text).strip()
182
- if not combined_text:
183
- return "No input text found."
184
-
185
- # 3) Tiktoken chunk
186
- chunks = chunk_text_with_tiktoken(combined_text, chunk_size=chunk_size)
187
-
188
- # 4) Tüm chunk'ları birleştirip (veya isterseniz parça parça da işleyebilirsiniz),
189
- # 11-chunk mantığına sokalım
190
- full_text = "\n".join(chunks)
191
- final_output = generate_4_headings_3_validation(full_text)
192
-
193
- return final_output.replace("\n", "<br>")
194
-
195
- #######################################
196
- # 6) GRADIO ARAYÜZ TANIMI
197
- #######################################
198
- demo = gr.Interface(
199
- fn=main_interface,
200
- inputs=[
201
- gr.File(label="Upload PDF/DOCX/TXT (optional)"),
202
- gr.Textbox(lines=5, label="Or Paste Some Text"),
203
- gr.Slider(minimum=100, maximum=2000, step=100, value=500, label="Chunk Size (tokens)")
204
- ],
205
- outputs="html",
206
- title="PDF/DOCX + Tiktoken + 4 Heading + 3 Validation (11 Chunk)"
207
- )
208
-
209
- def run():
210
- demo.launch()
211
 
 
212
  if __name__ == "__main__":
213
- run()
 
 
 
 
1
+ # app.py
2
+ # --------------------------------------------------------------------------------
3
+ # Bu kod, tamamen geliştirici (insan) tarafından, öğretici ve eğitim amacıyla
4
+ # yazılmıştır. GPT-4o-mini modelini kullanarak 4 başlık + 1 kontrol chunk (5 chunk)
5
+ # şeklinde metin oluşturma akışını gösterir. Minimum 4000, maksimum 10000 kelime
6
+ # üretilmesi hedeflenir. Kod, Gradio ile görsel bir arayüz sunar.
7
+ #
8
+ # NOT: Lütfen 'YOUR_API_KEY_HERE' kısmına kendi OpenAI API anahtarınızı ekleyin.
9
+ # Bu kodda max_tokens 10,000, temperature 0.8 kullanarak uzun ve yaratıcı çıktılar
10
+ # elde etmeyi amaçlıyoruz.
11
+ #
12
+ # Bu proje tamamen insan emeğiyle yazılmıştır, geliştirici tarafından tasarlanmıştır.
13
+ # --------------------------------------------------------------------------------
14
+
15
  import os
16
+ import re
17
  import gradio as gr
 
 
 
18
 
19
+ # Ek kütüphaneler
20
+ try:
21
+ from openai import OpenAI
22
+ import tiktoken
23
+ from PyPDF2 import PdfReader
24
+ from docx import Document
25
+ except ImportError:
26
+ raise ImportError("Lütfen 'openai', 'tiktoken', 'gradio', 'PyPDF2', 'python-docx' paketlerini kurun.")
27
 
28
# -------------------------- OpenAI settings --------------------------
# API client for the GPT-4o-mini model.
# Fix: never hard-code a real key in source. Read it from the environment
# (OPENAI_API_KEY); fall back to the old placeholder so behavior is
# unchanged when the variable is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY_HERE"))
31
 
32
def call_openai_chat(messages, max_tokens=10000, temperature=0.8, model="gpt-4o-mini"):
    """Send a chat-completion request and return the reply text.

    Parameters:
        messages: list of {"role": ..., "content": ...} chat messages.
        max_tokens: generation cap (10000 -> long outputs).
        temperature: 0.8 -> more creative / longer prose.
        model: OpenAI model id. The previously hard-coded "gpt-4o-mini" is
               now a backward-compatible keyword parameter so callers can
               override it without changing this function.

    Returns:
        The assistant message content (str) of the first choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=None  # no early stop sequences
    )
    return response.choices[0].message.content
46
+
47
+ # ------------------------- Chunk Mantığı -------------------------
48
def heading1_part1(input_text):
    """Chunk #1 -- first half of Heading 1.

    Produces a partial "Introductory overview" from the user-provided text.
    """
    prompt = f"""
We have some input text. We want the first part of 'Heading 1: Introductory overview of input'.
Please produce a partial text focusing on an introduction (about 1000+ words).
Do NOT finalize heading 1 yet, just a partial introduction.

Input text:
{input_text}
"""
    # Fixed system role plus the prompt above; delegated to the shared caller.
    return call_openai_chat([
        {"role": "system", "content": "You are a helpful assistant generating partial text for heading #1."},
        {"role": "user", "content": prompt},
    ])
66
+
67
def heading1_part2(h1_part1_text):
    """Chunk #2 -- second half of Heading 1.

    Expands the partial draft into the final heading (target: 2000+ words).
    """
    prompt = f"""
Below is the partial text for heading 1:
{h1_part1_text}

Now finalize heading 1 by merging expansions or clarifications.
Ensure heading 1 is at least 2000 words in total. Add depth and examples.
Return only the final text for heading 1.
"""
    # Same two-message shape as chunk #1, with a finalizing system role.
    return call_openai_chat([
        {"role": "system", "content": "You are finalizing heading #1."},
        {"role": "user", "content": prompt},
    ])
85
+
86
def single_heading_chunk(existing_text, heading_title):
    """Chunk #3 or #4 -- generate one heading in a single request.

    existing_text: prior content (e.g. the finalized heading 1) used as context.
    heading_title: title of the heading to produce (~1000+ words targeted).
    """
    prompt = f"""
We have some text for context (heading1 or previous content).
Please produce a new heading: '{heading_title}' with around 1000+ words if possible.
Do not produce final expansions for other headings.

Context:
{existing_text}
"""
    conversation = [
        {"role": "system", "content": "You are generating a single-chunk heading text."},
        {"role": "user", "content": prompt},
    ]
    return call_openai_chat(conversation)
104
+
105
def heading4_and_expansions(heading1_text, heading2_text, heading3_text, input_text):
    """Chunk #5 -- produce Heading 4 and merge everything into the final text.

    Asks the model to write 'Heading 4: Summary and next steps for students',
    combine headings 1-4, then expand the result to 4000+ words or shorten it
    below 10000 words.

    Bug fix: the original prompt never interpolated `heading3_text` -- it
    contained a placeholder line "(Will be produced next, or we have it if
    created)" -- so the model finalized the document without ever seeing
    Heading 3. The parameter is now inserted into the prompt.
    """
    user_prompt = f"""
We have 3 headings so far:

[Heading 1]
{heading1_text}

[Heading 2]
{heading2_text}

[Heading 3]
{heading3_text}

Now produce Heading 4: 'Summary and next steps for students.'
Then combine headings 1,2,3,4 into one final text.
If the entire text (4 headings) is under 4000 words, expand or add content
to any heading until we reach 4000+ words.
If above 10000 words, shorten while keeping crucial details.
Return the final text with headings 1,2,3,4 merged.
No separate block, but unify expansions or edits.

You can also use original input context:
{input_text}
"""
    messages = [
        {"role": "system", "content": "You are finalizing heading #4 and ensuring total word count 4000-10000."},
        {"role": "user", "content": user_prompt},
    ]
    return call_openai_chat(messages)
138
+
139
+ # -------------------- Dosya Okuma Yardımcı Fonksiyonlar --------------------
140
def read_pdf(file_path: str) -> str:
    """Extract text from a PDF file.

    Fix: the original concatenated page texts with no separator, which glued
    the last word of one page to the first word of the next. Pages are now
    joined with a newline.
    """
    with open(file_path, "rb") as f:
        reader = PdfReader(f)
        # extract_text() may return None for image-only pages; skip those.
        pages = [page.extract_text() for page in reader.pages]
    return "\n".join(text for text in pages if text)
150
 
151
def read_docx(file_path: str) -> str:
    """Reads text from a DOCX file (one line per paragraph)."""
    document = Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
158
 
159
def read_txt(file_path: str) -> str:
    """Reads a plain-text file as UTF-8, silently dropping undecodable bytes."""
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
163
 
164
+ # --------------- Gradio Arayüz Fonksiyonları ---------------
165
def process_input_text_or_file(txt_input, file_obj):
    """Return the input text, preferring an uploaded file over the textbox.

    txt_input: free text from the textbox (str or None).
    file_obj: upload from Gradio (an object with a `.name` temp path, or a
              plain path string in newer Gradio versions), or None.

    Fixes over the original:
    - Gradio already stores the upload as a temp file on disk; the original
      called `file_obj.read()` (not available on newer Gradio file values)
      and then re-wrote the bytes to the same path. We now use the path
      directly.
    - `txt_input.strip()` crashed when the textbox value was None; guarded.
    """
    if file_obj is None:
        # No file uploaded -> fall back to the textbox content.
        return (txt_input or "").strip()

    # Newer Gradio may hand us a plain filepath string instead of an object.
    file_name = getattr(file_obj, "name", None) or str(file_obj)
    ext = os.path.splitext(file_name)[1].lower().lstrip(".")

    if ext == "pdf":
        return read_pdf(file_name)
    elif ext == "docx":
        return read_docx(file_name)
    elif ext == "txt":
        return read_txt(file_name)
    else:
        # Unknown extension: best-effort decode of the raw bytes.
        with open(file_name, "rb") as fh:
            return fh.read().decode("utf-8", errors="ignore")
193
+
194
def generate_5_chunks(input_txt):
    """Run the five-chunk generation pipeline.

    Chunks: #1 Heading1 part1, #2 Heading1 part2 (finalize), #3 Heading2,
    #4 Heading3, #5 Heading4 + expand/shorten -> final merged text.

    Returns:
        (html, info): the final text with <br> line breaks, and a short
        status string reporting the approximate word count.
    """
    # Chunks #1 + #2: draft heading 1, then finalize it.
    h1 = heading1_part2(heading1_part1(input_txt))

    # Chunks #3 and #4: headings 2 and 3, each using heading 1 as context.
    h2 = single_heading_chunk(h1, "Heading 2: Detailed explanation of common risks.")
    h3 = single_heading_chunk(h1, "Heading 3: Practical examples and solutions.")

    # Chunk #5: heading 4 plus merge/expand/shorten into the final text.
    final_text = heading4_and_expansions(h1, h2, h3, input_txt)

    # Render newlines as <br> for HTML output.
    final_html = final_text.replace("\n", "<br>")

    # Word count on a tag-stripped copy of the text.
    stripped = re.sub(r"<.*?>", "", final_text)
    wcount = len(stripped.split())

    info = f"✅ Done. The final text is approx {wcount} words."
    return final_html, info
226
+
227
+ def gradio_interface(txt_input, file_upload):
228
+ # Tek fonksiyon, hem input hem output
229
+ read_content = process_input_text_or_file(txt_input, file_upload)
230
+ if not read_content:
231
+ return "⚠️ Please provide text or file input.", ""
232
+ # 5-chunk workflow
233
+ final_html, info = generate_5_chunks(read_content)
234
+ return final_html, info
235
 
236
+ # --------------- Gradio Demo ---------------
237
def build_gradio_app():
    """Construct the Gradio Interface (text + file inputs, HTML + info outputs).

    Bug fix: `gr.File(..., optional=True)` -- the `optional` keyword was
    removed in modern Gradio releases, so passing it raises a TypeError at
    startup. File inputs are optional by default in gr.Interface, so the
    flag is simply dropped. (NOTE(review): harmless but also unnecessary on
    old Gradio versions that still accepted it.)
    """
    # Text and file inputs; either one may be used.
    text_input = gr.Textbox(
        lines=5,
        label="Text Input (Optional)",
        placeholder="Enter some text or upload a file..."
    )
    file_input = gr.File(
        label="Upload File (PDF/DOCX/TXT)",
        file_types=[".pdf", ".docx", ".txt"]
    )
    # Outputs: the generated HTML plus a status/word-count label.
    output_html = gr.HTML(label="Generated Output (Min 4000 words, Max 10000 words)")
    info_label = gr.Label(label="Process Info (Word Count etc.)")

    demo = gr.Interface(
        fn=gradio_interface,
        inputs=[text_input, file_input],
        outputs=[output_html, info_label],
        title="5-Chunks GPT-4o-mini (4000-10000 words) Example",
        description=(
            "A demonstration of chunk-based approach with GPT-4o-mini model. "
            "We produce 4 headings: "
            "Heading1(part1+part2), Heading2, Heading3, and then Heading4 & expansions "
            "if total words < 4000 or shorten if > 10000."
            "\n(Coded by a human developer, not AI. For educational purposes.)"
        )
    )
    return demo
 
 
268
 
269
# Script entry point.
if __name__ == "__main__":
    # Build the Gradio app and serve it (Gradio's default local host is
    # http://127.0.0.1:7860).
    demo_app = build_gradio_app()
    demo_app.launch()