# Source code for autorag.ingest

"""Document loading and chunking primitives for the RAG pipeline.

These functions form the boundary between filesystem inputs (text
files, PDFs, audio clips) and the structured
:class:`~autorag.schemas.Document` / :class:`~autorag.schemas.Chunk`
shapes consumed by the embedder and vector store.

The current implementations are stubs that raise
:class:`NotImplementedError`; concrete loaders are wired up via
:class:`autorag.core.AutoRAG`.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from pathlib import Path

    from autorag.schemas import Chunk, Document


def load_documents(paths: list[str | Path]) -> list[Document]:
    """Load text documents from disk into :class:`~autorag.schemas.Document` records.

    This is currently a stub: it performs no I/O and unconditionally raises.

    :param paths: Filesystem locations of the documents to load, given as
        strings or :class:`pathlib.Path` objects.
    :returns: One :class:`~autorag.schemas.Document` per loaded input
        (intended contract; nothing is returned by the stub).
    :raises NotImplementedError: Always, until a concrete loader is provided.
    """
    raise NotImplementedError
def load_audio_clips(paths: list[str | Path]) -> list[dict[str, Any]]:
    """Load audio clip metadata for transcript-based ingestion.

    This is currently a stub: it performs no I/O and unconditionally raises.

    :param paths: Filesystem locations of the audio clips, given as strings
        or :class:`pathlib.Path` objects.
    :returns: One string-keyed metadata mapping per clip (intended contract;
        the key schema is not defined by this stub).
    :raises NotImplementedError: Always, until a concrete loader is provided.
    """
    raise NotImplementedError
def chunk_document(doc: Document, chunk_size: int, chunk_overlap: int) -> list[Chunk]:
    """Split a document into overlapping :class:`~autorag.schemas.Chunk` records.

    This is currently a stub: the arguments are not inspected and the call
    unconditionally raises.

    :param doc: The :class:`~autorag.schemas.Document` to split.
    :param chunk_size: Target size of each chunk. NOTE(review): the unit
        (characters, tokens, ...) is not fixed by this stub; confirm against
        the concrete implementation.
    :param chunk_overlap: Amount shared between consecutive chunks, in the
        same unit as ``chunk_size``.
    :returns: The chunks produced from ``doc`` (intended contract; nothing is
        returned by the stub).
    :raises NotImplementedError: Always, until a concrete chunker is provided.
    """
    raise NotImplementedError