-- db/migration.sql
--
-- Migration from original rag_content schema:
--   id TEXT PRIMARY KEY,
--   embedding VECTOR(1536) NOT NULL,
--   context TEXT NOT NULL,
--   user_id INTEGER NOT NULL,
--   document_type TEXT NOT NULL,
--   document_id TEXT,
--   username TEXT
--
-- to the new schema defined in db/schema.sql.
--
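-- How to run (ON_ERROR_STOP makes psql stop and exit non-zero at the first
-- failing statement instead of continuing through the rest of the file):
--   psql -v ON_ERROR_STOP=1 "$DATABASE_URL" -f db/migration.sql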

-- Everything up to the final COMMIT runs in a single transaction, so a failure
-- in any step rolls the whole migration back, including the table rename.
BEGIN;

------------------------------------------------------------
-- 1) Ensure pgvector exists
------------------------------------------------------------
-- Provides the VECTOR column type and the ivfflat index method used below.
CREATE EXTENSION IF NOT EXISTS vector;

------------------------------------------------------------
-- 2) Rename old table
------------------------------------------------------------
-- The original rows stay available as rag_content_old so they can be copied
-- into the new table and inspected before the optional cleanup.
ALTER TABLE rag_content RENAME TO rag_content_old;

-- Renaming a table does not rename its indexes. Free the default primary-key
-- name so the new table's key is created as rag_content_pkey instead of under
-- an auto-suffixed name. IF EXISTS makes this a no-op when the old key was
-- named differently.
ALTER INDEX IF EXISTS rag_content_pkey RENAME TO rag_content_old_pkey;

------------------------------------------------------------
-- 3) Create new rag_content table (matches db/schema.sql)
------------------------------------------------------------
-- New rag_content layout: one row per embedded chunk of text.
-- Only content, embedding, embedding_model, embedding_dim, checksum and
-- created_at are mandatory; every descriptive metadata column is nullable.
CREATE TABLE rag_content (
    -- Core identifier (new, auto-generated)
    id              BIGSERIAL PRIMARY KEY,

    -- Core content + embedding
    content         TEXT         NOT NULL,
    embedding       VECTOR(1536) NOT NULL,

    --------------------------------------------------------
    -- Book / document metadata (nullable / optional)
    --------------------------------------------------------
    title           TEXT,
    author          TEXT,

    isbn            TEXT,
    book_id         TEXT,

    chapter         TEXT,
    chapter_num     INTEGER,
    section         TEXT,
    page_start      INTEGER,
    page_end        INTEGER,
    language        TEXT,
    genre           TEXT,
    tags            TEXT[],
    edition         TEXT,
    published_date  DATE,
    source_path     TEXT,
    source_id       TEXT,
    rights          TEXT,
    license         TEXT,

    --------------------------------------------------------
    -- Chunk metadata
    --------------------------------------------------------
    chunk_id            TEXT,
    chunk_index         INTEGER,
    chunk_token_count   INTEGER,

    --------------------------------------------------------
    -- Embedding metadata
    --------------------------------------------------------
    embedding_model TEXT    NOT NULL,
    -- Must agree with the VECTOR(1536) column above; enforced by
    -- rag_content_ck_embedding_dim.
    embedding_dim   INTEGER NOT NULL DEFAULT 1536,
    checksum        TEXT    NOT NULL,

    --------------------------------------------------------
    -- Bookkeeping
    --------------------------------------------------------
    created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),

    --------------------------------------------------------
    -- Optional multi-tenant / app-specific fields
    --------------------------------------------------------
    -- Carried over from the original schema, where user_id and document_type
    -- were NOT NULL; they are nullable here.
    user_id         INTEGER,
    document_type   TEXT,
    document_id     TEXT,
    username        TEXT,

    --------------------------------------------------------
    -- Constraints
    --------------------------------------------------------
    CONSTRAINT rag_content_ck_embedding_dim
        CHECK (embedding_dim = 1536),

    -- Only enforced when both page bounds are present.
    CONSTRAINT rag_content_ck_page_range
        CHECK (
          page_start IS NULL
          OR page_end   IS NULL
          OR page_start <= page_end
        ),

    -- Both bounds must hold at once. Joining them with OR would accept every
    -- non-NULL date, because any date satisfies at least one of the two
    -- comparisons.
    -- NOTE(review): confirm db/schema.sql uses the same AND condition.
    CONSTRAINT rag_content_ck_published_date
        CHECK (
          published_date IS NULL
          OR (
            published_date >= DATE '1400-01-01'
            AND published_date <= DATE '2100-01-01'
          )
        )
);

------------------------------------------------------------
-- 3b) Add idempotency constraint
------------------------------------------------------------
-- At most one row per checksum value: inserting a row whose checksum is
-- already present fails with a unique violation.
ALTER TABLE rag_content
    ADD CONSTRAINT rag_content_checksum_uniq
        UNIQUE (checksum);

------------------------------------------------------------
-- 4) Copy data from old table into new table
------------------------------------------------------------
-- Note:
-- - We DO NOT insert into id (BIGSERIAL) so it auto-generates.
-- - We map:
--     context   -> content
--     embedding -> embedding
--     user_id, document_type, document_id, username -> same fields
-- - All other metadata columns are left NULL for now.
-- - Adjust 'text-embedding-3-small' if using a different model name.

-- Backfill the new table from the renamed original.
-- id and created_at are omitted so their column defaults apply; metadata
-- columns not listed here stay NULL.
-- NOTE(review): rag_content_checksum_uniq is already in place at this point,
-- so two old rows with the same (context, document_id) pair produce the same
-- md5 and abort the whole migration. Check rag_content_old for such
-- duplicates before running.
INSERT INTO rag_content (
    content,
    embedding,
    embedding_model,
    embedding_dim,
    checksum,
    user_id,
    document_type,
    document_id,
    username
)
SELECT
    old_row.context,
    old_row.embedding,
    'text-embedding-3-small',
    1536,
    -- Checksum is derived from the chunk text plus its document id (empty
    -- string when the id is NULL).
    md5(old_row.context || COALESCE(old_row.document_id, '')),
    old_row.user_id,
    old_row.document_type,
    old_row.document_id,
    old_row.username
FROM rag_content_old AS old_row;

------------------------------------------------------------
-- 5) Indexes (same as db/schema.sql)
------------------------------------------------------------

-- Index names are unique per schema and are NOT changed when a table is
-- renamed. If the original table already used any of the names below, a
-- CREATE INDEX IF NOT EXISTS would skip silently and leave the new table
-- without that index. Move such indexes out of the way first, then create the
-- new ones unconditionally so any remaining collision fails loudly and rolls
-- the migration back.
-- Assumes an existing index with one of these names belongs to
-- rag_content_old. Each rename is a no-op when no such index exists.
ALTER INDEX IF EXISTS idx_rag_content_embedding_ivfflat
    RENAME TO idx_rag_content_old_embedding_ivfflat;

ALTER INDEX IF EXISTS idx_rag_content_author
    RENAME TO idx_rag_content_old_author;

ALTER INDEX IF EXISTS idx_rag_content_language
    RENAME TO idx_rag_content_old_language;

ALTER INDEX IF EXISTS idx_rag_content_isbn_book_id
    RENAME TO idx_rag_content_old_isbn_book_id;

ALTER INDEX IF EXISTS idx_rag_content_published_date
    RENAME TO idx_rag_content_old_published_date;

ALTER INDEX IF EXISTS idx_rag_content_tags_gin
    RENAME TO idx_rag_content_old_tags_gin;

-- Vector index for similarity search (cosine distance).
-- Built after the data copy, so the index is created over the migrated rows.
-- NOTE(review): lists = 100 is carried over unchanged; pgvector's guidance
-- ties this value to the row count. Confirm it suits the table size.
CREATE INDEX idx_rag_content_embedding_ivfflat
    ON rag_content
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- B-tree indexes for common filters
CREATE INDEX idx_rag_content_author
    ON rag_content (author);

CREATE INDEX idx_rag_content_language
    ON rag_content (language);

CREATE INDEX idx_rag_content_isbn_book_id
    ON rag_content (isbn, book_id);

CREATE INDEX idx_rag_content_published_date
    ON rag_content (published_date);

-- GIN index for tags[]
CREATE INDEX idx_rag_content_tags_gin
    ON rag_content
    USING GIN (tags);

-- Makes the rename, new table, data copy and indexes visible atomically.
COMMIT;

-- Optional cleanup once you're confident migration succeeded:
-- DROP TABLE rag_content_old;
