-- db/schema.sql
--
-- RAG content schema for AI Engineering homework
--
-- How to run:
--   Ensure your DATABASE_URL is set, then run:
--     psql "$DATABASE_URL" -f db/schema.sql
--
--   Or, with explicit connection params:
--     psql -h localhost -p 5432 -U your_user -d your_db -f db/schema.sql
--
-- Notes:
-- - Uses pgvector for embedding similarity search.
-- - Embeddings are 1536-dimensional (adjust if you use a different model).
-- - Some metadata fields (language, genre, tags, published_date, etc.)
--   are included as NULLABLE to satisfy the homework rubric even if
--   they are not used yet by the application.

------------------------------------------------------------
-- 1) Pin the schema, then enable the pgvector extension
------------------------------------------------------------
-- search_path is set BEFORE creating the extension on purpose: without an
-- explicit SCHEMA clause, CREATE EXTENSION installs its objects into the
-- current default creation schema. Pinning search_path first makes a fresh
-- install of the `vector` type land in `public`, the same schema in which
-- the rag_content table below is created and its VECTOR column is resolved.
--
-- NOTE(review): IF NOT EXISTS leaves an already-installed extension where
-- it is. If pgvector was previously installed into another schema, that
-- schema must also be on search_path for the table definition to resolve.
SET search_path TO public;

CREATE EXTENSION IF NOT EXISTS vector;

------------------------------------------------------------
-- 2) RAG content table
------------------------------------------------------------
-- rag_content: one row per embedded text chunk, plus optional book/document,
-- chunk, embedding and app-specific metadata.
--
-- NOTE: CREATE TABLE IF NOT EXISTS is a no-op when rag_content already
-- exists. A database created from an earlier version of this file keeps its
-- old constraint definitions; apply constraint changes there with
-- ALTER TABLE ... DROP CONSTRAINT / ADD CONSTRAINT.
CREATE TABLE IF NOT EXISTS rag_content (
    -- Surrogate primary key
    id              BIGSERIAL PRIMARY KEY,

    -- Core content + embedding
    content         TEXT         NOT NULL,         -- the text chunk
    embedding       VECTOR(1536) NOT NULL,         -- dimension must match your embedding model

    --------------------------------------------------------
    -- Book / document metadata (all nullable; some may be unused for now)
    --------------------------------------------------------
    title           TEXT,
    author          TEXT,

    -- Either isbn or book_id can be used as the primary book identifier.
    isbn            TEXT,
    book_id         TEXT,

    chapter         TEXT,
    chapter_num     INTEGER,
    section         TEXT,
    page_start      INTEGER,       -- nullable; see rag_content_ck_page_range
    page_end        INTEGER,       -- nullable; see rag_content_ck_page_range
    language        TEXT,          -- nullable (optional use)
    genre           TEXT,          -- nullable (optional use)
    tags            TEXT[],        -- nullable (optional use; GIN-indexed)
    edition         TEXT,          -- nullable (optional use)
    published_date  DATE,          -- nullable; see rag_content_ck_published_date
    source_path     TEXT,          -- nullable (optional use)
    source_id       TEXT,          -- nullable (optional use)
    rights          TEXT,          -- nullable (optional use)
    license         TEXT,          -- nullable (optional use)

    --------------------------------------------------------
    -- Chunk metadata
    --------------------------------------------------------
    chunk_id            TEXT,      -- logical chunk id (nullable)
    chunk_index         INTEGER,   -- position of chunk in book (nullable)
    chunk_token_count   INTEGER,   -- token count for chunk (nullable)

    --------------------------------------------------------
    -- Embedding metadata
    --------------------------------------------------------
    embedding_model TEXT    NOT NULL,              -- e.g. 'text-embedding-3-small'
    embedding_dim   INTEGER NOT NULL DEFAULT 1536, -- must equal the VECTOR dimension above
    -- Required and unique across the whole table.
    -- NOTE(review): presumably a hash of the chunk used for de-duplication
    -- on ingest; confirm what the application actually hashes.
    checksum        TEXT NOT NULL UNIQUE,

    --------------------------------------------------------
    -- Bookkeeping
    --------------------------------------------------------
    created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),

    --------------------------------------------------------
    -- Optional multi-tenant / app-specific fields
    --------------------------------------------------------
    user_id         INTEGER,
    document_type   TEXT,
    document_id     TEXT,
    username        TEXT,

    --------------------------------------------------------
    -- Constraints
    --------------------------------------------------------
    -- Keeps the recorded dimension in lock-step with VECTOR(1536).
    CONSTRAINT rag_content_ck_embedding_dim
        CHECK (embedding_dim = 1536),

    -- Only enforced when both ends of the page range are present.
    CONSTRAINT rag_content_ck_page_range
        CHECK (
          page_start IS NULL
          OR page_end   IS NULL
          OR page_start <= page_end
        ),

    -- Sanity window for publication dates (inclusive on both ends).
    -- The two bounds must be combined with AND: with OR, every non-NULL
    -- date satisfies at least one side and the check can never fail.
    CONSTRAINT rag_content_ck_published_date
        CHECK (
          published_date IS NULL
          OR (
            published_date >= DATE '1400-01-01'
            AND published_date <= DATE '2100-01-01'
          )
        )
);

------------------------------------------------------------
-- 3) Indexes
------------------------------------------------------------

-- Similarity-search index on the embedding column: ivfflat access method,
-- cosine-distance operator class, 100 lists (tune `lists` as needed).
-- NOTE(review): this index is built right after table creation; pgvector's
-- documentation advises building ivfflat indexes once data is loaded, so
-- confirm whether it should be rebuilt (REINDEX) after the initial ingest.
CREATE INDEX IF NOT EXISTS idx_rag_content_embedding_ivfflat
ON rag_content USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);

-- B-tree indexes for common filters
-- Filter by author.
CREATE INDEX IF NOT EXISTS idx_rag_content_author
ON rag_content USING btree (author);

-- Filter by language.
CREATE INDEX IF NOT EXISTS idx_rag_content_language
ON rag_content USING btree (language);

-- Composite index: serves filters on isbn alone, or on isbn and book_id
-- together.
CREATE INDEX IF NOT EXISTS idx_rag_content_isbn_book_id
    ON rag_content (isbn, book_id);

-- book_id is documented on the table as an alternative book identifier, but
-- a B-tree on (isbn, book_id) is not searched efficiently by its second
-- column alone. Give book_id-only lookups their own index.
CREATE INDEX IF NOT EXISTS idx_rag_content_book_id
    ON rag_content (book_id);

-- Filter / range-scan by publication date.
CREATE INDEX IF NOT EXISTS idx_rag_content_published_date
ON rag_content USING btree (published_date);

-- GIN index over the tags TEXT[] column.
-- NOTE(review): presumably intended for array operators such as @> and &&;
-- confirm against the application's tag queries.
CREATE INDEX IF NOT EXISTS idx_rag_content_tags_gin
ON rag_content USING gin (tags);
