File size: 4,971 Bytes

ffcb401

# © 2025 Elena Marziali — Code released under Apache 2.0 license.
# See LICENSE in the repository for details.
# Removal of this copyright is prohibited.

# Evaluate the structure of the AI response from the LLM
def validate_ai_structure(response, expected_fields=("title", "abstract", "url")):
    """Keep only the well-formed entries of a parsed LLM response.

    Args:
        response: Parsed model output. Anything that is not a list yields
            an empty result.
        expected_fields: Keys that every entry must contain to be kept.

    Returns:
        A new list with, in their original order, the dict entries of
        ``response`` that contain every key in ``expected_fields``.
    """
    if not isinstance(response, list):
        return []

    def has_required_keys(entry):
        # Non-dict entries (strings, None, ...) are silently dropped.
        return isinstance(entry, dict) and all(
            field in entry for field in expected_fields
        )

    return [entry for entry in response if has_required_keys(entry)]

import math

# Compute semantic score of the response
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of ``x``.

    The computation is split by sign so that ``math.exp`` is only ever
    called with a non-positive argument. The naive form raises
    ``OverflowError`` for large negative inputs (roughly ``x < -709``);
    this form saturates to 0.0 or 1.0 instead.

    Args:
        x: A real number.

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x use the algebraically equivalent e**x / (1 + e**x);
    # e**x underflows harmlessly to 0.0 instead of overflowing.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

def evaluate_score(model_output):
    """Map the first element of ``model_output`` to a score in [0, 1].

    The first element is converted to ``float``, passed through
    ``sigmoid`` and rounded to three decimals. This is a best-effort
    helper: if the value cannot be read or converted, 0.0 is returned
    instead of raising.

    Args:
        model_output: An indexable object whose element 0 is convertible
            to ``float``.

    Returns:
        The rounded sigmoid of the first element, or 0.0 on failure.
    """
    try:
        raw_score = float(model_output[0])
        return round(sigmoid(raw_score), 3)
    except Exception:
        # Deliberately broad: the shape and type of model_output are not
        # fixed here. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return 0.0

# Extract text from selected file
def extract_text(file_name, max_chars=5000):
    """Read the textual content of a .pdf, .docx, .csv or .tsv file.

    The format is chosen from the lower-cased text after the last dot of
    ``file_name``. Failures are reported through the return value rather
    than raised.

    Args:
        file_name: Path of the file to read.
        max_chars: Maximum number of characters of text to return.

    Returns:
        The first ``max_chars`` characters of the extracted text,
        "No text extracted." when the text is empty, or a message
        starting with "Error during text extraction:" when reading fails
        or the format is unsupported.
    """
    suffix = file_name.lower().rpartition(".")[2]

    try:
        if suffix == "pdf":
            with pdfplumber.open(file_name) as pdf:
                # extract_text() may yield nothing for a page; use "" then.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
            content = "\n".join(page_texts).strip()
        elif suffix == "docx":
            paragraphs = Document(file_name).paragraphs
            content = "\n".join(para.text for para in paragraphs).strip()
        elif suffix in ("csv", "tsv"):
            delimiter = "\t" if suffix == "tsv" else ","
            table = pd.read_csv(file_name, sep=delimiter)
            content = table.to_string(index=False)
        else:
            raise ValueError(f"Unsupported format: .{suffix}")

        if not content:
            return "No text extracted."
        return content[:max_chars]

    except Exception as e:
        return f"Error during text extraction: {e}"

# Safely extract textual content from an AIMessage
def extract_text_from_ai(obj):
    """Safely extract stripped textual content from an AIMessage-like object.

    Args:
        obj: An object that may expose a ``content`` attribute, or any
            other value.

    Returns:
        * ``str(obj).strip()`` when ``obj`` has no ``content`` attribute;
        * ``""`` when ``content`` is ``None``;
        * the stripped ``content`` when it is a string;
        * the stripped ``str(content)`` for any other content type
          (previously this case raised ``AttributeError``).
    """
    missing = object()  # sentinel: tells "no attribute" apart from content=None
    content = getattr(obj, "content", missing)
    if content is missing:
        return str(obj).strip()
    if content is None:
        return ""
    if not isinstance(content, str):
        # NOTE(review): non-string content is presumably structured
        # (e.g. a list of parts); it is stringified as-is — confirm
        # whether callers need something richer.
        content = str(content)
    return content.strip()

# Extract figure captions from text
def extract_captions_from_text(text):
    """Find figure captions such as "Figure 3: ..." or "Fig. 2 - ..." in text.

    A caption is the word "Figure" or "Fig"/"Fig." (case-insensitive),
    a number, an optional ``:``, ``.``, ``-`` or en-dash separator, and
    the rest of that line.

    Args:
        text: The text to scan.

    Returns:
        The full matched caption strings, in order of appearance.
    """
    # The group is non-capturing on purpose: with a capturing group
    # re.findall returns only the group ("Figure"/"Fig.") instead of the
    # whole caption. The leading \b keeps words that merely end in
    # "figure"/"fig" (e.g. "Configure 3 ...") from matching.
    pattern = r"\b(?:Figure|Fig\.?)\s*\d+[:\.\-–]?\s*[^\n]+"
    return re.findall(pattern, text, re.IGNORECASE)

# Extract images and captions from a file
def _caption_or_default(found_captions, position, default):
    """Return the caption at ``position``, or ``default`` once captions run out."""
    return found_captions[position] if position < len(found_captions) else default


def _extract_pdf_images(file_path, output_folder):
    """Save every image embedded in a PDF and pair each with a caption."""
    images, captions = [], []
    # The context manager closes the document even if extraction fails;
    # previously the handle was never closed.
    with fitz.open(file_path) as doc:
        full_text = "\n".join(page.get_text("text") for page in doc)
        found_captions = extract_captions_from_text(full_text)

        for page_no, page in enumerate(doc, start=1):
            for img_no, img in enumerate(page.get_images(full=True), start=1):
                # img[0] is handed to extract_image as the image reference.
                extracted = doc.extract_image(img[0])
                path = os.path.join(
                    output_folder, f"page{page_no}_img{img_no}.{extracted['ext']}"
                )
                with open(path, "wb") as f:
                    f.write(extracted["image"])
                captions.append(
                    _caption_or_default(
                        found_captions, len(images), f"Figure {page_no}.{img_no}"
                    )
                )
                images.append(path)
    return images, captions


def _extract_docx_images(file_path, output_folder):
    """Save every embedded image of a DOCX file and pair each with a caption."""
    images, captions = [], []
    doc = Document(file_path)
    body_text = "\n".join(p.text for p in doc.paragraphs)
    found_captions = extract_captions_from_text(body_text)

    # Use the public ``rels`` mapping rather than the private ``_rels``.
    for rel_no, relation in enumerate(doc.part.rels.values(), start=1):
        # Externally linked targets have no embedded part to read, so
        # skip them instead of letting one abort the whole extraction.
        if getattr(relation, "is_external", False):
            continue
        if "image" not in relation.target_ref:
            continue
        # Keep the image's real extension (as the PDF branch does)
        # instead of labelling every file ".png"; fall back to ".png"
        # when the reference has no extension.
        suffix = os.path.splitext(relation.target_ref)[1] or ".png"
        path = os.path.join(output_folder, f"docx_image_{rel_no}{suffix}")
        with open(path, "wb") as f:
            f.write(relation.target_part.blob)
        captions.append(
            _caption_or_default(found_captions, len(images), f"Figure {rel_no}")
        )
        images.append(path)
    return images, captions


def extract_images_with_captions(file_path, output_folder="extracted_figures"):
    """Extract embedded images from a PDF or DOCX file into a folder.

    Captions are taken from the document text with
    ``extract_captions_from_text`` and assigned to images purely by
    order: the n-th extracted image gets the n-th caption found. When
    there are fewer captions than images, a generic "Figure ..." label
    is used instead.

    Args:
        file_path: Path of the source document; the format is chosen from
            the lower-cased text after the last dot.
        output_folder: Directory the images are written to; created if it
            does not exist.

    Returns:
        A tuple ``(images, captions)`` of equal-length lists holding the
        written file paths and their captions. Both lists are empty for
        an unsupported extension or when any error occurs (the error is
        printed, not raised).
    """
    os.makedirs(output_folder, exist_ok=True)
    extension = file_path.lower().split(".")[-1]
    images, captions = [], []

    try:
        if extension == "pdf":
            images, captions = _extract_pdf_images(file_path, output_folder)
        elif extension == "docx":
            images, captions = _extract_docx_images(file_path, output_folder)
        else:
            print(f"Unsupported extension: .{extension}")

        print(f"{len(images)} image(s) extracted.")
        return images, captions

    except Exception as e:
        # Best-effort by design: report the problem and return nothing,
        # even if some image files were already written.
        print(f"Error extracting images: {e}")
        return [], []

# Generate semantic coherence note based on score
def generate_note(score):
    """Translate a coherence score into a short human-readable note.

    Args:
        score: Numeric coherence score.

    Returns:
        The "high" note for scores strictly above 0.85, the "moderate"
        note for scores strictly above 0.6, and the "low" note otherwise.
    """
    # Ordered from the highest threshold down; the first one exceeded wins.
    tiers = (
        (0.85, "High semantic coherence. The response is likely solid and relevant."),
        (0.6, "Moderate coherence. The response is understandable but may contain approximations."),
    )
    for threshold, note in tiers:
        if score > threshold:
            return note
    return "Low coherence. It may be helpful to rephrase the question or provide more context."

# Simulate LLM response generation
def generate_response(question, temperature=0.7):
    """Return a canned stand-in for an LLM answer.

    Args:
        question: The prompt text.
        temperature: Value echoed in the simulated response.

    Returns:
        A fixed rephrased question when ``question`` contains the
        substring "Rephrase" (case-sensitive); otherwise a placeholder
        string that embeds ``temperature`` and ``question``.
    """
    is_rephrase_request = "Rephrase" in question
    if not is_rephrase_request:
        return f"[Simulated response at temperature {temperature} for: {question}]"
    return "How does enthalpy change during a phase transition?"