"""
load_books.py

Python loader script for ingesting book paragraph chunks and embeddings into the
`rag_content` table (via Supabase), aligned with the AI Engineering homework
schema requirements.

This version parses book content directly from a Project Gutenberg-style HTML file
instead of JSON.

Responsibilities:
- Parse a Gutenberg HTML file into Book / Chapter / Paragraph objects.
- Build paragraph-level chunks with previous/next paragraph context.
- Generate embeddings using OpenAI.
- Compute metadata (chapter_num, chunk_index, token counts, checksum, etc.).
- Upsert into `rag_content` with all required fields populated.

See db/schema.sql and db/migration.sql for the expected database schema.
"""

import argparse
import hashlib
import os
from dataclasses import dataclass
from typing import List, Optional

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from supabase import create_client, Client


# ---------------------------------------------------------------------------
# Configuration / constants
# ---------------------------------------------------------------------------

# Default model passed to the embeddings API by get_embedding(); also recorded
# in the `embedding_model` field of every row written to rag_content.
EMBEDDING_MODEL = "text-embedding-3-small"
# Expected length of each embedding vector; get_embedding() raises ValueError
# when the API returns a vector of a different length.
EMBEDDING_DIM = 1536

# Static values written to the `language`, `tags` and `genre` fields of every row.
DEFAULT_LANGUAGE = "en"
DEFAULT_TAGS = ["project-gutenberg", "public-domain"]
DEFAULT_GENRE: Optional[str] = None  # e.g. "fiction"

# Values insert_book_paragraphs() passes as `user_id` / `document_type` for
# every paragraph chunk it loads.
DEFAULT_USER_ID = 1
DEFAULT_DOCUMENT_TYPE = "book_paragraph"


# ---------------------------------------------------------------------------
# Data models
# ---------------------------------------------------------------------------

@dataclass
class Paragraph:
    """
    One non-empty paragraph of a chapter together with its neighbouring text.
    """

    # 1-based position of the source <p> tag within its chapter. Blank <p> tags
    # are skipped but still counted, so numbers may contain gaps.
    num: int
    # Normalized paragraph text (see normalize_text).
    text: str
    # Normalized text of the <p> tag immediately before / after this one, or
    # None at the start / end of the chapter. May be an empty string when the
    # neighbouring tag has no text.
    context_prev: Optional[str]
    context_post: Optional[str]
    # Chunk identifier of the form "<book_id>-ch<chapter>-p<num>".
    id: str


@dataclass
class Chapter:
    """
    A chapter of a parsed book: its position, title and paragraphs.
    """

    # 1-based chapter index in document order (always 1 when the whole body is
    # treated as a single chapter).
    num: int
    # Heading text of the chapter, "Chapter <num>" when no heading is found, or
    # the book title in the single-chapter fallback.
    title: str
    # Non-empty paragraphs of the chapter, in document order.
    paragraphs: List[Paragraph]


@dataclass
class Book:
    """
    A parsed book: identifying metadata plus its chapters.
    """

    # Value of the dcterms.identifier <meta> tag, or the HTML file's base name
    # without extension when that tag is absent.
    id: str
    # Title from the dcterms.title <meta> tag, the <title> element, or `id`.
    title: str
    # Author from the creator/author <meta> tags, or "<UNKNOWN AUTHOR>".
    author: str
    # Chapters in document order.
    chapters: List[Chapter]


# ---------------------------------------------------------------------------
# Utility functions
# ---------------------------------------------------------------------------

def compute_checksum(content: str, book_id: Optional[str]) -> str:
    """
    Return the hex MD5 digest used for the schema field `checksum`.

    The digest covers the UTF-8 bytes of `content`, immediately followed by
    the UTF-8 bytes of `book_id` when a non-empty book id is supplied. The
    same inputs always give the same digest, which is what makes re-loading
    a book idempotent.
    """
    payload = content.encode("utf-8")
    if book_id:
        # No separator is inserted: the digest is md5(content + book_id).
        payload += book_id.encode("utf-8")
    return hashlib.md5(payload).hexdigest()


def normalize_text(text: str) -> str:
    """
    Replace curly ("smart") quotes and en/em dashes with plain ASCII
    characters, then strip leading and trailing whitespace.

    Keeping the punctuation uniform avoids subtle differences in checksums
    and embeddings between otherwise identical passages.
    """
    # Single/double curly quotes -> ' and ", en dash / em dash -> hyphen.
    ascii_table = str.maketrans(
        "\u2018\u2019\u201c\u201d\u2013\u2014",
        "''\"\"--",
    )
    return text.translate(ascii_table).strip()


def get_openai_client() -> OpenAI:
    """
    Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Raises RuntimeError when the variable is missing or empty.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        return OpenAI(api_key=api_key)
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")


def get_supabase_client() -> Client:
    """
    Build a Supabase client from the SUPABASE_URL and SUPABASE_KEY
    environment variables.

    Raises RuntimeError when either variable is missing or empty.
    """
    url, key = (os.getenv(name) for name in ("SUPABASE_URL", "SUPABASE_KEY"))
    if url and key:
        return create_client(url, key)
    raise RuntimeError(
        "SUPABASE_URL and SUPABASE_KEY must be set in the environment"
    )


def get_embedding(openai_client: OpenAI, text: str, model: str = EMBEDDING_MODEL) -> List[float]:
    """
    Request an embedding for `text` from OpenAI's embeddings API.

    The text is sent as a single-item batch and the first (only) result is
    returned. Raises ValueError when the returned vector does not have
    EMBEDDING_DIM entries.
    """
    response = openai_client.embeddings.create(model=model, input=[text])
    vector = response.data[0].embedding

    actual_dim = len(vector)
    if actual_dim == EMBEDDING_DIM:
        return vector

    raise ValueError(
        f"Embedding dimension mismatch: expected {EMBEDDING_DIM}, got {actual_dim}"
    )


# ---------------------------------------------------------------------------
# HTML parsing (Project Gutenberg-style)
# ---------------------------------------------------------------------------

def _meta_content(soup: BeautifulSoup, *names: str) -> Optional[str]:
    """
    Return the stripped `content` of the first <meta name=...> tag (tried in
    the order given) whose content is non-blank, or None if there is none.
    """
    for name in names:
        tag = soup.find("meta", attrs={"name": name})
        if tag is None:
            continue
        value = (tag.get("content") or "").strip()
        if value:
            return value
    return None


def _extract_paragraphs(p_tags, book_id: str, chapter_num: int) -> List[Paragraph]:
    """
    Turn a chapter's <p> tags into Paragraph objects with neighbour context.

    Numbering is the 1-based position among *all* given tags, so blank tags
    are skipped but still counted. Context comes from the immediately
    adjacent tag and may therefore be an empty string when that tag is blank.
    """
    # Normalize every tag exactly once; neighbours are then plain list lookups.
    texts = [normalize_text(p.get_text(" ", strip=True)) for p in p_tags]
    last_index = len(texts) - 1

    paragraphs: List[Paragraph] = []
    for index, text in enumerate(texts):
        if not text:
            continue
        num = index + 1
        paragraphs.append(
            Paragraph(
                num=num,
                text=text,
                context_prev=texts[index - 1] if index > 0 else None,
                context_post=texts[index + 1] if index < last_index else None,
                id=f"{book_id}-ch{chapter_num}-p{num}",
            )
        )
    return paragraphs


def parse_book_from_html(path: str) -> Book:
    """
    Parse a Project Gutenberg-style HTML file into a Book object.

    Heuristics:
    - Book id: <meta name="dcterms.identifier">, else the file's base name
      without extension.
    - Title: <meta name="dcterms.title">, else <title>, else the book id.
    - Author: <meta name="dc.creator">, then <meta name="dcterms.creator">,
      then <meta name="author">, else "<UNKNOWN AUTHOR>".
      Blank meta values are treated as missing so the next source is tried.
    - Chapters are <div class="chapter"> elements; the chapter title is the
      text of its first <h2>/<h3> (text nodes joined with a space, so a
      heading split by <br/> or inline tags keeps its word boundaries), or
      "Chapter N" when there is no usable heading. If no chapter divs exist,
      the entire document body is treated as a single chapter titled after
      the book.
    - Paragraphs are <p> elements (direct children of a chapter div, or any
      <p> in the body for the fallback); each captures the previous/next
      paragraph text as context.

    The file is read as UTF-8; I/O and decoding errors propagate to the caller.
    """
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # --- Book ID ---
    file_stem = os.path.splitext(os.path.basename(path))[0]
    book_id = _meta_content(soup, "dcterms.identifier") or file_stem

    # --- Title ---
    title = _meta_content(soup, "dcterms.title")
    if not title and soup.title and soup.title.string:
        title = soup.title.string.strip()
    title = title or book_id  # fallback

    # --- Author ---
    author = (
        _meta_content(soup, "dc.creator", "dcterms.creator", "author")
        or "<UNKNOWN AUTHOR>"
    )

    chapters: List[Chapter] = []

    # Prefer explicit chapter containers
    chapter_divs = soup.select("div.chapter")

    if chapter_divs:
        # Typical Gutenberg HTML with <div class="chapter">
        for ch_idx, div in enumerate(chapter_divs, start=1):
            # Handles <h2><a id="..."></a>Title</h2> as well as headings whose
            # text is split across <br/> or inline tags.
            heading = div.find(["h2", "h3"])
            heading_text = heading.get_text(" ", strip=True) if heading else ""
            ch_title = heading_text or f"Chapter {ch_idx}"

            # Only direct <p> children belong to this chapter's paragraph list.
            paragraphs = _extract_paragraphs(
                div.find_all("p", recursive=False), book_id, ch_idx
            )
            chapters.append(Chapter(num=ch_idx, title=ch_title, paragraphs=paragraphs))
    else:
        # Fallback: treat entire body as one chapter
        body = soup.body or soup
        paragraphs = _extract_paragraphs(body.find_all("p"), book_id, 1)
        chapters.append(Chapter(num=1, title=title, paragraphs=paragraphs))

    return Book(id=book_id, title=title, author=author, chapters=chapters)


# ---------------------------------------------------------------------------
# DB writer
# ---------------------------------------------------------------------------

def load_vectors_into_supabase(
    supabase_client: Client,
    *,
    id: str,
    embedding: List[float],
    content: str,
    user_id: int,
    document_type: str,
    meta_doc: dict,
    source_path: Optional[str] = None,
):
    """
    Upsert one paragraph chunk into the `rag_content` table.

    `meta_doc` supplies the book/chapter metadata under the keys
    "book_title", "book_author", "chapter_title", "chapter_number",
    "paragraph_number" and "book_id"; missing keys are stored as None.

    The row is keyed on `checksum` (derived from the content and the book
    id, falling back to the book title and then the chunk id), so loading
    the same chunk again updates the existing row. Any error raised by the
    upsert is printed and swallowed; nothing is returned.
    """
    lookup = meta_doc.get
    title = lookup("book_title")
    book_id = lookup("book_id")

    digest = compute_checksum(content, book_id or title or id)
    vector_length = len(embedding)
    # Whitespace-separated word count, stored in the token-count field.
    word_count = len(content.split())

    # core content + embedding
    content_fields = {
        "content": content,
        "embedding": embedding,
    }

    # book/doc metadata
    # `section`, `isbn`, `edition`, `rights`, `license`, `published_date`,
    # `source_id`, `page_start`, `page_end` may be filled later if available.
    book_fields = {
        "title": title,
        "author": lookup("book_author"),
        "book_id": book_id,
        "chapter": lookup("chapter_title"),
        "chapter_num": lookup("chapter_number"),  # chronological chapter number
        "language": DEFAULT_LANGUAGE,
        "genre": DEFAULT_GENRE,
        "tags": DEFAULT_TAGS,
        "source_path": source_path,
    }

    # chunk + embedding metadata
    chunk_fields = {
        "chunk_id": id,
        "chunk_index": lookup("paragraph_number"),
        "chunk_token_count": word_count,
        "embedding_model": EMBEDDING_MODEL,
        "embedding_dim": vector_length,
        "checksum": digest,
    }

    # original multi-tenant-ish fields (`document_id`, `username` are optional
    # and not populated here)
    tenant_fields = {
        "user_id": user_id,
        "document_type": document_type,
    }

    row = {**content_fields, **book_fields, **chunk_fields, **tenant_fields}

    try:
        table = supabase_client.table("rag_content")
        table.upsert(row, on_conflict="checksum").execute()
    except Exception as exc:
        print(f"  ✗ Error inserting vectors {id}: {exc}")


def insert_book_paragraphs(
    openai_client: OpenAI,
    supabase_client: Client,
    book: Book,
    source_path: str,
):
    """
    Embed every paragraph of `book` and store it in the database.

    For each paragraph the chunk text is the previous, current and next
    paragraph texts (whichever are non-empty) joined by blank lines. The
    chunk is embedded and then written to Supabase with its book/chapter
    metadata. A paragraph whose embedding fails is reported and skipped.
    Progress is printed every 20 paragraphs and every 5 chapters.
    """
    print(f"INFO: starting DB insert for paragraph chunks for book: {book.title}")

    for chapter_pos, chapter in enumerate(book.chapters):
        chapters_done = chapter_pos + 1

        for para_pos, paragraph in enumerate(chapter.paragraphs):
            paragraphs_done = para_pos + 1

            # Previous / current / next paragraph text, dropping empty pieces.
            window = (paragraph.context_prev, paragraph.text, paragraph.context_post)
            content = "\n\n".join(filter(None, window))

            try:
                embedding = get_embedding(openai_client, content)
            except Exception as exc:
                print(f"  ✗ Error generating embedding for {paragraph.id}: {exc}")
                continue

            load_vectors_into_supabase(
                supabase_client,
                id=paragraph.id,
                embedding=embedding,
                content=content,
                user_id=DEFAULT_USER_ID,
                document_type=DEFAULT_DOCUMENT_TYPE,
                meta_doc={
                    "book_title": book.title,
                    "book_author": book.author,
                    "chapter_title": chapter.title,
                    "chapter_number": chapter.num,
                    "paragraph_number": paragraph.num,
                    "book_id": book.id,
                },
                source_path=source_path,
            )

            if paragraphs_done % 20 == 0:
                print(
                    f"INFO: processed {paragraphs_done} / {len(chapter.paragraphs)} paragraphs for chapter: {chapter.title}"
                )

        if chapters_done % 5 == 0:
            print(
                f"INFO: processed {chapters_done} / {len(book.chapters)} chapters for book: {book.title}"
            )

    print(f"INFO: finished DB insert for book: {book.title}")


# ---------------------------------------------------------------------------
# CLI entrypoint
# ---------------------------------------------------------------------------

def main():
    """
    CLI entry point: parse the --path/-p argument, load environment settings,
    create the OpenAI and Supabase clients, parse the HTML book and load its
    paragraph embeddings into rag_content.
    """
    cli = argparse.ArgumentParser(
        description="Load a Gutenberg HTML book's paragraph embeddings into rag_content."
    )
    cli.add_argument(
        "--path",
        "-p",
        required=True,
        help="Path to the Project Gutenberg HTML file.",
    )
    html_path = cli.parse_args().path

    # Pull variables from a .env file into the environment before the client
    # factories read them.
    load_dotenv()

    openai_client = get_openai_client()
    supabase_client = get_supabase_client()

    book = parse_book_from_html(html_path)
    chapter_count = len(book.chapters)
    print(
        f"INFO: loaded book '{book.title}' by '{book.author}' with {chapter_count} chapters from {html_path}"
    )

    insert_book_paragraphs(openai_client, supabase_client, book, source_path=html_path)

    print("INFO: done processing.")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
