Metadata Filtering | Captain Docs

Metadata filtering lets you attach structured key-value pairs to your files at index time, then narrow search results at query time using those fields. This is useful for scoping searches by department, date range, access level, content type, or any other dimension relevant to your data.

Attaching Metadata at Index Time

Pass custom_metadata in the request body of any indexing endpoint. Every chunk created from the indexed file inherits that metadata.

Plain text

1 import requests
2 
3 BASE_URL = "https://api.runcaptain.com"
4 headers = {
5     "Authorization": f"Bearer {API_KEY}",
6     "Content-Type": "application/json"
7 }
8 
9 response = requests.post(
10     f"{BASE_URL}/v2/collections/my_collection/index/text",
11     headers=headers,
12     json={
13         "content": "Q4 2024 revenue grew 23% year-over-year to $4.2B...",
14         "filename": "q4-2024-earnings.txt",
15         "custom_metadata": {
16             "department": "finance",
17             "year": 2024,
18             "quarter": "Q4",
19             "is_public": False
20         }
21     }
22 )

1 const response = await fetch(
2   `${BASE_URL}/v2/collections/my_collection/index/text`,
3   {
4     method: "POST",
5     headers: {
6       "Authorization": `Bearer ${API_KEY}`,
7       "Content-Type": "application/json"
8     },
9     body: JSON.stringify({
10       content: "Q4 2024 revenue grew 23% year-over-year to $4.2B...",
11       filename: "q4-2024-earnings.txt",
12       custom_metadata: {
13         department: "finance",
14         year: 2024,
15         quarter: "Q4",
16         is_public: false
17       }
18     })
19   }
20 );

Cloud storage (S3, GCS, Azure, R2)

The custom_metadata you pass applies to every file in the indexing job.

1 response = requests.post(
2     f"{BASE_URL}/v2/collections/my_collection/index/s3",
3     headers=headers,
4     json={
5         "bucket_name": "company-docs",
6         "aws_access_key_id": "AKIAIOSFODNN7EXAMPLE",
7         "aws_secret_access_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
8         "bucket_region": "us-east-1",
9         "processing_type": "advanced",
10         "custom_metadata": {
11             "source": "s3",
12             "department": "engineering",
13             "confidentiality": "internal"
14         }
15     }
16 )

Supported value types

Type	Example
String	`"department": "legal"`
Integer	`"year": 2024`
Float	`"confidence": 0.95`
Boolean	`"is_public": true`
String array	`"tags": ["earnings", "quarterly"]`

Filtering at Query Time

Pass metadata_filter in the query request body to restrict results to chunks matching your criteria.

1 response = requests.post(
2     f"{BASE_URL}/v2/collections/my_collection/query",
3     headers=headers,
4     json={
5         "query": "What were the key revenue drivers?",
6         "inference": False,
7         "top_k": 10,
8         "metadata_filter": {
9             "department": "finance",
10             "year": {"$gte": 2024}
11         }
12     }
13 )

1 const response = await fetch(
2   `${BASE_URL}/v2/collections/my_collection/query`,
3   {
4     method: "POST",
5     headers: {
6       "Authorization": `Bearer ${API_KEY}`,
7       "Content-Type": "application/json"
8     },
9     body: JSON.stringify({
10       query: "What were the key revenue drivers?",
11       inference: false,
12       top_k: 10,
13       metadata_filter: {
14         department: "finance",
15         year: { $gte: 2024 }
16       }
17     })
18   }
19 );

Filters work with both inference: true (AI-powered answers) and inference: false (raw search results).

Filter Operators

Operator	Description	Example
(bare value)	Equals	`{"department": "legal"}`
`$eq`	Equals (explicit)	`{"department": {"$eq": "legal"}}`
`$ne`	Not equals	`{"department": {"$ne": "hr"}}`
`$gt`	Greater than	`{"year": {"$gt": 2023}}`
`$gte`	Greater than or equal	`{"year": {"$gte": 2024}}`
`$lt`	Less than	`{"year": {"$lt": 2025}}`
`$lte`	Less than or equal	`{"year": {"$lte": 2024}}`
`$in`	In set	`{"department": {"$in": ["legal", "finance"]}}`
`$nin`	Not in set	`{"department": {"$nin": ["hr", "ops"]}}`

Combining Filters

Implicit AND

Multiple fields at the top level are combined with AND:

1 {
2   "metadata_filter": {
3     "department": "finance",
4     "year": {"$gte": 2024},
5     "is_public": false
6   }
7 }

This matches chunks where department is “finance” AND year is at least 2024 AND is_public is false.

Explicit OR

Use $or to match chunks that satisfy any of the conditions:

1 {
2   "metadata_filter": {
3     "$or": [
4       {"department": "legal"},
5       {"department": "finance"}
6     ]
7   }
8 }

Mixing AND and OR

Combine top-level AND with nested OR:

1 {
2   "metadata_filter": {
3     "year": {"$gte": 2024},
4     "$or": [
5       {"department": "legal"},
6       {"department": "finance"}
7     ]
8   }
9 }

This matches chunks where year is at least 2024 AND department is either “legal” OR “finance”.

Full Example

Index documents with metadata, then query with filters:

1 import requests
2 
3 BASE_URL = "https://api.runcaptain.com"
4 API_KEY = "your_api_key"
5 COLLECTION = "company_docs"
6 
7 headers = {
8     "Authorization": f"Bearer {API_KEY}",
9     "Content-Type": "application/json"
10 }
11 
12 # 1. Index with metadata
13 requests.post(
14     f"{BASE_URL}/v2/collections/{COLLECTION}/index/text",
15     headers=headers,
16     json={
17         "content": "The board approved a 15% increase in R&D spending for fiscal year 2025...",
18         "filename": "board-minutes-2025.txt",
19         "custom_metadata": {
20             "department": "executive",
21             "year": 2025,
22             "document_type": "minutes",
23             "is_public": False
24         }
25     }
26 )
27 
28 # 2. Query with filters
29 response = requests.post(
30     f"{BASE_URL}/v2/collections/{COLLECTION}/query",
31     headers=headers,
32     json={
33         "query": "R&D budget decisions",
34         "inference": False,
35         "top_k": 5,
36         "metadata_filter": {
37             "department": {"$in": ["executive", "finance"]},
38             "year": {"$gte": 2024}
39         }
40     }
41 )
42 
43 for result in response.json().get("search_results", []):
44     print(f"  {result['filename']} (score: {result['score']:.3f})")