# MindWiki — robots policy
#
# Goal: maximum discoverability for public marketing, docs, blog,
# comparison, glossary, and API-reference pages — across both
# traditional web crawlers and the growing set of AI search /
# retrieval / training crawlers. Private app surfaces (signed-in
# account, the Next.js internal /api routes, OAuth flows, and the
# /login + /signup screens) are off-limits to every crawler so they
# never end up in indexes or AI training sets.
#
# Implementation notes:
# - Per the robots.txt spec, named user-agent groups do NOT inherit
#   directives from `*`. Each named group repeats the same private
#   route disallows on purpose.
# - "Allow: /" is explicit so crawlers that look for it (some
#   legacy parsers) don't interpret the disallow list as a global
#   block.
# - The Sitemap directive at the bottom is the canonical pointer
#   to /sitemap.xml — auto-generated by Next.js from the route
#   registry plus docs/api-reference/blog manifests.
#
# References used to build this policy:
# - OpenAI: https://platform.openai.com/docs/bots
# - Anthropic: https://support.anthropic.com/en/articles/8896518
# - Perplexity: https://docs.perplexity.ai/guides/bots
# - Google: https://developers.google.com/search/docs/crawling-indexing/google-extended
# - Apple: https://support.apple.com/en-us/119829
# - Common Crawl: https://commoncrawl.org/faq

# ─── Default policy — applies to every crawler not named below ────
User-agent: *
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# ═══════════════════════════════════════════════════════════════════
# AI ANSWER ENGINES — real-time retrieval for chat / search answers
# These crawlers fetch pages at query time when a user asks a
# question. Allowing them is the single highest-leverage signal we
# can send for AI-driven discovery.
# ═══════════════════════════════════════════════════════════════════

# OpenAI ChatGPT Search bot
User-agent: OAI-SearchBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# OpenAI ChatGPT in-chat fetcher (when a user asks ChatGPT to read a URL)
User-agent: ChatGPT-User
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Anthropic Claude search bot (real-time citations in answers)
User-agent: Claude-SearchBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Anthropic Claude in-chat fetcher
User-agent: Claude-User
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Perplexity search bot
User-agent: PerplexityBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Perplexity in-chat fetcher
User-agent: Perplexity-User
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Mistral in-chat fetcher
User-agent: MistralAI-User
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# You.com search bot
User-agent: YouBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Cohere bot
User-agent: cohere-ai
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# ═══════════════════════════════════════════════════════════════════
# TRADITIONAL SEARCH ENGINES + AI-features overlays
# These are the bots that own Google/Bing/DuckDuckGo/etc. results.
# ═══════════════════════════════════════════════════════════════════

# Googlebot — Google Search crawling and indexing. This is the
# crawler that decides what shows up in Google Search results.
User-agent: Googlebot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Google-Extended — a separate opt-in/opt-out signal Google offers
# for SOME of its generative AI products (notably Gemini API
# grounding and Vertex AI). Per Google's own crawler docs,
# Google-Extended does NOT affect Google Search inclusion, ranking,
# or AI Overview eligibility — those are controlled by Googlebot.
# We allow it because we're comfortable being used as grounding
# context in those products; flip to `Disallow: /` if that ever
# changes.
User-agent: Google-Extended
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# GoogleOther — Google's general-purpose internal crawler for
# non-Search product research and one-off fetches. Not the Search
# bot and not Google-Extended; documented separately by Google.
User-agent: GoogleOther
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Bing / Microsoft Search (powers Copilot's web grounding)
User-agent: Bingbot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# DuckDuckGo
User-agent: DuckDuckBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Brave Search
User-agent: Bravebot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Apple Search + Spotlight
User-agent: Applebot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Apple AI features signal (Apple Intelligence grounding opt-in)
User-agent: Applebot-Extended
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Yandex (Russian search; reaches international tech audiences too)
User-agent: YandexBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Meta link-preview fetcher (for shared MindWiki links in WhatsApp/FB/IG)
User-agent: facebookexternalhit
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Meta external agent (Llama and Meta AI ground-from-web)
User-agent: Meta-ExternalAgent
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Meta user-initiated link fetcher
User-agent: Meta-ExternalFetcher
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Twitter / X card fetcher
User-agent: Twitterbot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# LinkedIn share fetcher
User-agent: LinkedInBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# ═══════════════════════════════════════════════════════════════════
# FOUNDATION-MODEL TRAINING CRAWLERS
# Allowed on public content only. Flip any of these to `Disallow: /`
# if a future policy change wants public content excluded from a
# specific training corpus. Same private-route exclusions apply
# universally.
# ═══════════════════════════════════════════════════════════════════

# OpenAI training bot
User-agent: GPTBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Anthropic training bot
User-agent: ClaudeBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Common Crawl (feeds many open-source and academic datasets)
User-agent: CCBot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# ByteDance (TikTok parent — runs its own LLM training)
User-agent: Bytespider
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Diffbot — third-party knowledge graph; powers a number of AI
# ground-truth tools
User-agent: Diffbot
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Omgili — research crawler
User-agent: omgili
Allow: /
Disallow: /account/
Disallow: /api/
Disallow: /oauth/
Disallow: /login
Disallow: /signup

# Sitemap — auto-generated by /src/app/sitemap.ts at build time
Sitemap: https://mindwiki.io/sitemap.xml
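
# ── Example — opting a single bot out later ──────────────────────
# The "flip to `Disallow: /`" notes above mean editing that bot's
# own group in place. As a sketch (using GPTBot purely as an
# illustration), the whole group would become:
#
#   User-agent: GPTBot
#   Disallow: /
#
# No Allow line is needed: a bare `Disallow: /` blocks the entire
# site for that user-agent only, and every other named group is
# unaffected, since groups do not inherit from `*` or each other.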