Commit Diff


commit - /dev/null
commit + d2898c58887876902d86df71013d58fdc187b9ee
blob - /dev/null
blob + 74bb1492d7c4e50ea3def3a724b49552221920ba (mode 644)
--- /dev/null
+++ README.md
@@ -0,0 +1,8 @@
+# auth-gate
+
+Inspired by
+https://her.esy.fun/posts/0031-how-i-protect-my-forgejo-instance-from-ai-web-crawlers/index.html
+
+Needs `{gcc,llvm}` and `{fast,slow}cgi`
+
+**DISCLAIMER**: Vibe-coded with an LLM. I'm not much of a programmer...
blob - /dev/null
blob + 8b09380d9eb2cd78abcc817ee69a162e10525727 (mode 644)
--- /dev/null
+++ auth-gate.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2025 Manuel Kuklinski
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * SPDX-License-Identifier: ISC
+ */
+
+/*
+ * auth-gate.c -- CGI access control for OpenBSD/slowcgi
+ *
+ * WHAT DOES THIS PROGRAM DO?
+ * --------------------------
+ * This program runs as a CGI script behind a web server (slowcgi on OpenBSD).
+ * When a browser requests a page, the web server starts this program and passes
+ * information about the HTTP request via environment variables (e.g. which URL
+ * was requested, which cookies were sent, etc.).
+ *
+ * The program checks whether the browser sends a specific cookie.
+ * - Cookie present -> serve the file from the document root directory.
+ * - Cookie missing -> send a JavaScript snippet that sets the cookie and
+ *                     reloads the page (the browser does this automatically).
+ *
+ * SECURITY MEASURES (OpenBSD-specific):
+ * - unveil(): restricts filesystem access to DOCROOT (read-only).
+ * - pledge(): restricts all permitted syscalls to stdio + file access.
+ * - realpath() + prefix check: prevents path traversal attacks.
+ * - O_NOFOLLOW: open() refuses a file whose final path component is a symlink.
+ *
+ * BUILD (OpenBSD):
+ *   cc -O2 -Wall -Wextra -o auth-gate auth-gate.c
+ */
+
+#include <stdio.h>    /* printf(), fread(), fwrite(), fclose() */
+#include <stdlib.h>   /* getenv(), strtol() */
+#include <string.h>   /* strlen(), strncmp(), strchr(), strstr(), strcmp() */
+#include <strings.h>  /* strcasecmp() -- case-insensitive comparison */
+#include <sys/stat.h> /* fstat(), struct stat, S_ISREG() */
+#include <unistd.h>   /* unveil(), pledge(), close() */
+#include <fcntl.h>    /* open(), O_RDONLY, O_NOFOLLOW */
+#include <limits.h>   /* PATH_MAX -- maximum path length of the OS */
+
+/* --- Configuration constants --------------------------------------------- */
+
+/* Directory from which files may be served; requests must stay inside it.
+ * No trailing '/': the request path (starting with '/') is appended as is. */
+#define DOCROOT "/htdocs/asdfghasdfgh.de"
+
+/* Name and expected value of the access cookie. */
+#define COOKIE_NAME  "Yogsototh_opens_the_door"
+#define COOKIE_VALUE "1"
+
+/* Size in bytes (8 KiB) of the buffer used to copy file data to stdout. */
+#define BUF_SIZE 8192
+
+/* --- Helper functions ----------------------------------------------------- */
+
+/*
+ * url_decode -- turn %XX escapes in a URL path back into raw bytes
+ *
+ * Copies src to dst in a single pass, replacing each usable escape by
+ * the byte it stands for (%2F -> '/', %20 -> ' '). dst is always
+ * NUL-terminated; output stops once dstsize - 1 bytes were written.
+ *
+ * Parameters:
+ *   dst     -- destination buffer
+ *   src     -- NUL-terminated input, e.g. "/page%2Ftest"
+ *   dstsize -- capacity of dst in bytes; with 0 nothing is written
+ *
+ * A '%' followed by two more characters is always consumed as a unit.
+ * The unit yields no output when the two characters are not both hex
+ * digits, or when the value is below 0x20 (incl. %00) or is 0x7F (DEL).
+ * A '%' with fewer than two characters after it is copied as-is.
+ */
+static void url_decode(char *dst, const char *src, size_t dstsize) {
+    if (dstsize == 0)
+        return;
+
+    char *out = dst;
+    char *const stop = dst + (dstsize - 1); /* keep one byte for '\0' */
+
+    while (*src != '\0' && out < stop) {
+        if (src[0] != '%' || src[1] == '\0' || src[2] == '\0') {
+            *out++ = *src++; /* ordinary character, or a dangling '%' */
+            continue;
+        }
+        /* Fold the two digits into one value; -1 flags a non-hex digit */
+        int byte = 0;
+        for (int k = 1; k <= 2 && byte >= 0; k++) {
+            char c = src[k];
+            if (c >= '0' && c <= '9')      byte = byte * 16 + (c - '0');
+            else if (c >= 'a' && c <= 'f') byte = byte * 16 + (c - 'a' + 10);
+            else if (c >= 'A' && c <= 'F') byte = byte * 16 + (c - 'A' + 10);
+            else                           byte = -1;
+        }
+        if (byte > 0x1f && byte != 0x7f)
+            *out++ = (char)byte;
+        src += 3; /* the whole escape is consumed either way */
+    }
+    *out = '\0';
+}
+
+/*
+ * sanitize_uri -- basic plausibility check of the (decoded) request path
+ *
+ * This function catches obvious attack patterns early.
+ * The actual, reliable protection against path traversal is provided
+ * later by realpath() + prefix check.
+ *
+ * Rejected:
+ *   - Paths that do not start with '/' (this includes the empty string
+ *     and a bare "..")
+ *   - Paths with a ".." component, i.e. containing "/../" or ending
+ *     with "/.."
+ * Names that merely begin with two dots ("/..foo") are accepted.
+ *
+ * Returns 0 if the path looks clean, -1 if suspicious.
+ */
+static int sanitize_uri(const char *uri) {
+    /* HTTP request paths must start with '/' */
+    if (uri[0] != '/')
+        return -1;
+
+    /* Inspect every "/.." and look at the character right after it */
+    for (const char *hit = strstr(uri, "/.."); hit != NULL;
+         hit = strstr(hit + 1, "/..")) {
+        if (hit[3] == '/' || hit[3] == '\0')
+            return -1; /* ".." is a complete path component */
+    }
+
+    return 0; /* path looks clean */
+}
+
+/*
+ * check_bypass -- checks whether the browser sends the access cookie
+ *
+ * CGI programs receive HTTP headers as environment variables.
+ * The "Cookie:" header is available in the HTTP_COOKIE variable, e.g.:
+ *   "session=abc; Yogsototh_opens_the_door=1; theme=dark"
+ *
+ * The header is walked pair by pair (pairs are separated by ';', blanks
+ * in front of a pair are ignored) until COOKIE_NAME=COOKIE_VALUE shows up.
+ * Headers longer than 4096 bytes are not inspected at all.
+ * No percent-decoding or case folding is applied; the comparison is an
+ * exact byte-wise match.
+ *
+ * Returns 1 if the cookie is present and correct, 0 otherwise
+ * (including a NULL header).
+ */
+static int check_bypass(const char *cookie_hdr) {
+    if (cookie_hdr == NULL || strlen(cookie_hdr) > 4096)
+        return 0; /* missing or suspiciously large header -> no access */
+
+    const size_t name_len  = strlen(COOKIE_NAME);
+    const size_t value_len = strlen(COOKIE_VALUE);
+    const char *cur = cookie_hdr;
+
+    while (*cur != '\0') {
+        cur += strspn(cur, " "); /* leading blanks of this pair */
+
+        if (strncmp(cur, COOKIE_NAME, name_len) == 0 && cur[name_len] == '=') {
+            cur += name_len + 1; /* now at the first byte of the value */
+
+            /* The value must end right here: '\0', ';' or ' ' has to
+             * follow, otherwise "...=10" would pass for "...=1" */
+            if (strncmp(cur, COOKIE_VALUE, value_len) == 0) {
+                char after = cur[value_len];
+                if (after == '\0' || after == ';' || after == ' ')
+                    return 1;
+            }
+        }
+
+        /* Move to the start of the next pair, just behind the next ';' */
+        cur += strcspn(cur, ";");
+        if (*cur == ';')
+            cur++;
+    }
+
+    return 0; /* cookie not found or value incorrect */
+}
+
+/*
+ * get_mime_type -- picks the Content-Type for a file by its extension
+ *
+ * The extension is whatever follows the last '.' in path; it is compared
+ * case-insensitively (".PNG" is treated like ".png") against a fixed
+ * table.
+ *
+ * Returns a constant string, e.g. "text/html; charset=utf-8".
+ * Paths without a '.' and unknown extensions yield
+ * "application/octet-stream" (browsers treat this as opaque binary data).
+ */
+static const char *get_mime_type(const char *path) {
+    static const struct { const char *ext, *mime; } known[] = {
+        { "html", "text/html; charset=utf-8" },
+        { "htm",  "text/html; charset=utf-8" },
+        { "css", "text/css" }, { "js", "application/javascript" },
+        { "json", "application/json" }, { "png", "image/png" },
+        { "jpg", "image/jpeg" }, { "jpeg", "image/jpeg" },
+        { "gif", "image/gif" }, { "svg", "image/svg+xml" },
+        { "ico", "image/x-icon" }, { "txt", "text/plain; charset=utf-8" },
+        { "pdf", "application/pdf" }, { "xml", "application/xml" },
+        { "webp", "image/webp" }, { "woff", "font/woff" },
+        { "woff2", "font/woff2" },
+    };
+    static const char fallback[] = "application/octet-stream";
+
+    const char *dot = strrchr(path, '.');
+    if (dot == NULL)
+        return fallback;
+    for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+        if (strcasecmp(dot + 1, known[k].ext) == 0)
+            return known[k].mime;
+    }
+    return fallback;
+}
+
+/*
+ * send_error -- sends an HTTP error response to the browser
+ *
+ * CGI programs talk to the web server via stdout; it relays the output.
+ *
+ * The response is made of header lines, an empty line, and a plain-text
+ * body of the form "<code> <reason>" (cut off after 63 bytes).
+ * Content-Length is the length of that body.
+ *
+ * Example for code=404, reason="Not Found":
+ *   Status: 404 Not Found\r\n
+ *   Content-Type: text/plain\r\n
+ *   X-Content-Type-Options: nosniff\r\n
+ *   Content-Length: 13\r\n
+ *   \r\n
+ *   404 Not Found
+ */
+static void send_error(int code, const char *reason) {
+    char text[64];
+    snprintf(text, sizeof(text), "%d %s", code, reason);
+
+    /* nosniff keeps the browser from second-guessing text/plain */
+    printf("Status: %d %s\r\n"
+           "Content-Type: text/plain\r\n"
+           "X-Content-Type-Options: nosniff\r\n"
+           "Content-Length: %zu\r\n"
+           "\r\n%s",
+           code, reason, strlen(text), text);
+}
+
+/* --- Main program --------------------------------------------------------- */
+
+int main(void) {
+    /*
+     * Read CGI environment variables.
+     * The web server populates these automatically before starting the CGI
+     * program.
+     *
+     * HTTP_COOKIE -- the browser's "Cookie:" header, e.g. "name=value"
+     * REQUEST_URI -- the requested path, e.g. "/page/index.html"
+     */
+    const char *cookie_hdr = getenv("HTTP_COOKIE");
+    const char *uri        = getenv("REQUEST_URI");
+
+    /*
+     * OpenBSD hardening: unveil() -- restrict filesystem access
+     *
+     * After this call the program may ONLY access DOCROOT.
+     * Attempts to open other directories (/etc, /home, ...) will fail
+     * with ENOENT, as if the files did not exist.
+     *
+     * "r" = read-only; no writing, no execution.
+     * unveil(NULL, NULL) finalises the configuration (no further paths).
+     */
+    if (unveil(DOCROOT, "r") == -1 ||
+        unveil(NULL, NULL)   == -1) {
+        send_error(500, "Internal Server Error");
+        return 1;
+    }
+
+    /*
+     * OpenBSD hardening: pledge() -- restrict permitted syscalls
+     *
+     * After this call the program is only allowed:
+     *   stdio  -- standard I/O (printf, fread, fwrite, ...)
+     *   rpath  -- read files (open, fstat, ...)
+     *
+     * Any other syscall (network, fork, exec, ...) makes the kernel kill
+     * the process with SIGABRT. This limits the damage
+     * if an attacker manages to inject code.
+     */
+    if (pledge("stdio rpath", NULL) == -1) {
+        send_error(500, "Internal Server Error");
+        return 1;
+    }
+
+    /* Fallback: if no URI was provided, assume the root page */
+    if (!uri) uri = "/";
+
+    /*
+     * Isolate the path: everything before the first '?' (or the whole URI
+     * when there is no query string), e.g. /page/index.html?search=test.
+     * A path that does not fit the buffer is refused outright -- silently
+     * cutting it short would make us look up a different file.
+     */
+    char raw_uri[PATH_MAX];
+    size_t uri_len = strcspn(uri, "?");
+    if (uri_len >= sizeof(raw_uri)) {
+        send_error(414, "URI Too Long");
+        return 0;
+    }
+    memcpy(raw_uri, uri, uri_len);
+    raw_uri[uri_len] = '\0';
+
+    /*
+     * Decode percent-encoding.
+     * "/page%2Ftest%20foo" becomes "/page/test foo".
+     */
+    char decoded_uri[PATH_MAX];
+    url_decode(decoded_uri, raw_uri, sizeof(decoded_uri));
+
+    /* First plausibility check of the path */
+    if (sanitize_uri(decoded_uri) != 0) {
+        send_error(400, "Bad Request");
+        return 0;
+    }
+
+    /*
+     * Cookie check -- the core of the access control.
+     *
+     * If the cookie is missing or incorrect, we send the browser a small
+     * JavaScript snippet. It sets the cookie and reloads the page.
+     * On the second request the cookie will be present.
+     *
+     * The snippet is assembled from COOKIE_NAME / COOKIE_VALUE so that it
+     * always sets exactly the cookie check_bypass() looks for.
+     *
+     * Cache-Control: no-store -- prevents the browser from caching this
+     * response (otherwise it might skip the reload).
+     */
+    if (!check_bypass(cookie_hdr)) {
+        const char *body =
+            "<script>"
+            "document.cookie = \"" COOKIE_NAME "=" COOKIE_VALUE "; Path=/;\";"
+            "window.location.reload();"
+            "</script>";
+
+        printf("Status: 418 I'm a teapot\r\n");
+        printf("Cache-Control: no-store\r\n");
+        printf("Content-Type: text/html\r\n");
+        printf("X-Content-Type-Options: nosniff\r\n");
+        printf("Content-Length: %zu\r\n\r\n%s", strlen(body), body);
+        return 0;
+    }
+
+    /*
+     * Build the file path.
+     * DOCROOT + URI, e.g.: /htdocs/asdfghasdfgh.de + /page/index.html
+     *                    = /htdocs/asdfghasdfgh.de/page/index.html
+     *
+     * A URI ending in '/' names a directory; serve its index.html instead.
+     * decoded_uri cannot be empty here: sanitize_uri() insists on a
+     * leading '/', so indexing its last character is safe.
+     */
+    const char *index_name =
+        decoded_uri[strlen(decoded_uri) - 1] == '/' ? "index.html" : "";
+    char path[PATH_MAX];
+    int r = snprintf(path, sizeof(path), "%s%s%s",
+                     DOCROOT, decoded_uri, index_name);
+    if (r < 0 || (size_t)r >= sizeof(path)) {
+        send_error(400, "Bad Request");
+        return 0;
+    }
+
+    /*
+     * Canonicalise the path -- the central defence against path traversal.
+     *
+     * realpath() resolves all "..", ".", symlinks, etc. and returns the
+     * true absolute path. "/htdocs/site/../../etc/passwd" becomes
+     * "/etc/passwd" -- which we catch in the next step.
+     *
+     * If realpath() fails (file does not exist) we return 404.
+     */
+    char resolved[PATH_MAX];
+    if (realpath(path, resolved) == NULL) {
+        send_error(404, "Not Found");
+        return 0;
+    }
+
+    /*
+     * Prefix check: the resolved path must lie within DOCROOT.
+     *
+     * Example: if realpath() resolved the request to /etc/passwd, the
+     * result does NOT start with "/htdocs/asdfghasdfgh.de/" -> 403.
+     *
+     * The second condition (resolved[docroot_len] != '/') prevents a
+     * directory with a similar name (e.g. /htdocs/asdfghasdfgh.de-evil)
+     * from being falsely accepted as inside DOCROOT.
+     */
+    size_t docroot_len = strlen(DOCROOT);
+    if (strncmp(resolved, DOCROOT, docroot_len) != 0 ||
+        (resolved[docroot_len] != '/' && resolved[docroot_len] != '\0')) {
+        send_error(403, "Forbidden");
+        return 0;
+    }
+
+    /*
+     * Open the file.
+     *
+     * O_RDONLY   -- open for reading only, not writing
+     * O_NOFOLLOW -- open() fails if the final path component is a symlink.
+     *               realpath() has just resolved every link, so one can
+     *               only be there if the file was swapped in the meantime.
+     *               Links in parent directories are NOT covered by this.
+     */
+    int fd = open(resolved, O_RDONLY | O_NOFOLLOW);
+    if (fd == -1) {
+        send_error(403, "Forbidden");
+        return 0;
+    }
+
+    /*
+     * Read file metadata via the open file descriptor (not the path), so
+     * the answer describes exactly the object we are about to send.
+     *
+     * S_ISREG checks that it is a regular file (not a directory, device,
+     * or pipe). We only want to serve actual files.
+     */
+    struct stat st;
+    if (fstat(fd, &st) != 0 || !S_ISREG(st.st_mode)) {
+        close(fd);
+        send_error(404, "Not Found");
+        return 0;
+    }
+
+    /*
+     * Wrap the file descriptor in a FILE stream.
+     * fdopen() takes ownership of fd -- fclose() will also close fd.
+     */
+    FILE *f = fdopen(fd, "rb");
+    if (!f) {
+        close(fd);
+        send_error(500, "Internal Server Error");
+        return 0;
+    }
+
+    /*
+     * Send the HTTP response headers.
+     *
+     * Content-Length: st.st_size is an off_t (may be 64-bit), so cast to
+     *   long long for a portable printf format specifier.
+     * X-Frame-Options: DENY -- prevents the page from being embedded in an
+     *   <iframe> (protection against clickjacking).
+     *
+     * The blank line (\r\n\r\n) separates the headers from the body.
+     */
+    printf("Status: 200 OK\r\n");
+    printf("Content-Type: %s\r\n", get_mime_type(resolved));
+    printf("X-Content-Type-Options: nosniff\r\n");
+    printf("X-Frame-Options: DENY\r\n");
+    printf("Content-Length: %lld\r\n\r\n", (long long)st.st_size);
+    fflush(stdout); /* flush headers immediately before sending the body */
+
+    /*
+     * Copy the file to stdout in BUF_SIZE-byte chunks. Stop as soon as a
+     * write comes up short (client gone) and report read/write failures
+     * through the exit status -- the headers are already out by then.
+     */
+    char buf[BUF_SIZE];
+    size_t n;
+    int rc = 0;
+    while (rc == 0 && (n = fread(buf, 1, sizeof(buf), f)) > 0)
+        rc = fwrite(buf, 1, n, stdout) != n;
+    if (ferror(f))
+        rc = 1;
+
+    fclose(f); /* also closes the underlying fd */
+    return rc;
+}