commit - /dev/null
commit + d2898c58887876902d86df71013d58fdc187b9ee
blob - /dev/null
blob + 74bb1492d7c4e50ea3def3a724b49552221920ba (mode 644)
--- /dev/null
+++ README.md
+# auth-gate
+
+Inspired by
+https://her.esy.fun/posts/0031-how-i-protect-my-forgejo-instance-from-ai-web-crawlers/index.html
+
+Requires a C compiler (`gcc` or `llvm`) and a CGI gateway (`fastcgi` or `slowcgi`).
+
+**DISCLAIMER**: Vibe-coded with an LLM. I'm not much of a programmer...
blob - /dev/null
blob + 8b09380d9eb2cd78abcc817ee69a162e10525727 (mode 644)
--- /dev/null
+++ auth-gate.c
+/*
+ * Copyright (c) 2025 Manuel Kuklinski
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * SPDX-License-Identifier: ISC
+ */
+
+/*
+ * auth-gate.c -- CGI access control for OpenBSD/slowcgi
+ *
+ * WHAT DOES THIS PROGRAM DO?
+ * --------------------------
+ * This program runs as a CGI script behind a web server (slowcgi on OpenBSD).
+ * When a browser requests a page, the web server starts this program and passes
+ * information about the HTTP request via environment variables (e.g. which URL
+ * was requested, which cookies were sent, etc.).
+ *
+ * The program checks whether the browser sends a specific cookie.
+ * - Cookie present -> serve the file from the document root directory.
+ * - Cookie missing -> send a JavaScript snippet that sets the cookie and
+ * reloads the page (the browser does this automatically).
+ *
+ * SECURITY MEASURES:
+ * - unveil() (OpenBSD): restricts filesystem access to DOCROOT (read-only).
+ * - pledge() (OpenBSD): restricts the process to the "stdio" and "rpath"
+ *   promises, i.e. standard I/O plus read-only file access.
+ * - realpath() + prefix check: prevents path traversal attacks.
+ * - O_NOFOLLOW: open() fails if the final path component is a symbolic link.
+ *
+ * BUILD (OpenBSD):
+ * cc -O2 -Wall -Wextra -o auth-gate auth-gate.c
+ */
+
+#include <stdio.h> /* printf(), fread(), fwrite(), fclose() */
+#include <stdlib.h> /* getenv(), strtol() */
+#include <string.h> /* strlen(), strncmp(), strchr(), strstr(), strcmp() */
+#include <strings.h> /* strcasecmp() -- case-insensitive comparison */
+#include <sys/stat.h> /* fstat(), struct stat, S_ISREG() */
+#include <unistd.h> /* unveil(), pledge(), close() */
+#include <fcntl.h> /* open(), O_RDONLY, O_NOFOLLOW */
+#include <limits.h> /* PATH_MAX -- maximum path length of the OS */
+
+/* --- Configuration constants --------------------------------------------- */
+
+/* Directory from which files may be served.
+ * All requests must remain within this path. */
+#define DOCROOT "/htdocs/asdfghasdfgh.de"
+
+/* Name and expected value of the access cookie. */
+#define COOKIE_NAME "Yogsototh_opens_the_door"
+#define COOKIE_VALUE "1"
+
+/* Buffer size for reading and sending files (8 KiB is sufficient). */
+#define BUF_SIZE 8192
+
+/* --- Helper functions ----------------------------------------------------- */
+
+/*
+ * hex_nibble -- numeric value of one hexadecimal digit
+ *
+ * Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F', or -1 for any other
+ * character.
+ */
+static int hex_nibble(char c) {
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+    return -1;
+}
+
+/*
+ * url_decode -- expand %XX escapes of a URL into raw bytes
+ *
+ * Percent-encoding represents a byte as '%' followed by two hexadecimal
+ * digits, e.g. %2F for '/' and %20 for a space.
+ *
+ * Parameters:
+ *   dst     -- output buffer; always NUL-terminated when dstsize > 0
+ *   src     -- NUL-terminated input (e.g. "/page%2Ftest")
+ *   dstsize -- capacity of dst in bytes; the output is truncated to fit,
+ *              and nothing at all is written when dstsize is 0
+ *
+ * Handling of escapes:
+ *   - A '%' that is not followed by at least two more characters is
+ *     copied through unchanged.
+ *   - An escape whose two characters are not both hex digits is dropped
+ *     from the output entirely (all three characters are skipped).
+ *   - Escapes that decode to NUL, any other control character (<= 0x1F)
+ *     or DEL (0x7F) are dropped as well, so they cannot terminate the
+ *     string early or smuggle control bytes into the result.
+ *   - Everything else, including %2F ('/'), is decoded; validating the
+ *     resulting path is left to the caller.
+ */
+static void url_decode(char *dst, const char *src, size_t dstsize) {
+    if (dstsize == 0)
+        return;
+
+    const size_t limit = dstsize - 1; /* keep one byte for the terminator */
+    size_t out = 0;
+
+    while (*src != '\0' && out < limit) {
+        int is_escape = (src[0] == '%' && src[1] != '\0' && src[2] != '\0');
+
+        if (!is_escape) {
+            /* Ordinary character (or a truncated trailing '%'): copy as is */
+            dst[out++] = *src++;
+            continue;
+        }
+
+        int hi = hex_nibble(src[1]);
+        int lo = hex_nibble(src[2]);
+
+        if (hi >= 0 && lo >= 0) {
+            int byte = hi * 16 + lo;
+
+            /* Emit only bytes that are neither control characters nor DEL */
+            if (byte > 0x1f && byte != 0x7f)
+                dst[out++] = (char)byte;
+        }
+
+        src += 3; /* consume '%' plus the two following characters */
+    }
+
+    dst[out] = '\0';
+}
+
+/*
+ * sanitize_uri -- basic plausibility check of the (decoded) request path
+ *
+ * This is a cheap early filter for obvious traversal patterns; it does
+ * not canonicalise the path and is not meant to be the only defence.
+ *
+ * Rejected:
+ *   - a NULL pointer
+ *   - paths that do not start with '/' (this also covers the empty string
+ *     and the bare string "..", so no separate test is needed for them)
+ *   - paths containing "/../" anywhere
+ *   - paths ending in "/.."
+ *
+ * Accepted examples: "/", "/page/index.html", "/a..b", "/..hidden".
+ *
+ * Returns 0 if the path looks clean, -1 if it is rejected.
+ */
+static int sanitize_uri(const char *uri) {
+    /* Request paths must be absolute */
+    if (uri == NULL || uri[0] != '/')
+        return -1;
+
+    /* A ".." component in the middle of the path */
+    if (strstr(uri, "/../") != NULL)
+        return -1;
+
+    /* A ".." component as the last element of the path */
+    size_t len = strlen(uri);
+    if (len >= 3 && strcmp(uri + len - 3, "/..") == 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * check_bypass -- tests whether the request carries the access cookie
+ *
+ * cookie_hdr is the raw value of the "Cookie:" request header (as found
+ * in the HTTP_COOKIE CGI variable), for example:
+ *     "session=abc; Yogsototh_opens_the_door=1; theme=dark"
+ *
+ * The header is walked pair by pair (pairs are separated by ';', leading
+ * spaces of each pair are ignored). A pair matches when it starts with
+ * COOKIE_NAME "=" COOKIE_VALUE and the value is immediately followed by
+ * the end of the string, a ';' or a space -- so a longer value such as
+ * "...=10" does not count.
+ *
+ * A NULL header, or one longer than 4096 bytes, is treated as "no cookie".
+ *
+ * Returns 1 if a matching pair is found, 0 otherwise.
+ */
+static int check_bypass(const char *cookie_hdr) {
+    static const char wanted[] = COOKIE_NAME "=" COOKIE_VALUE;
+    const size_t wanted_len = sizeof(wanted) - 1;
+
+    if (cookie_hdr == NULL)
+        return 0;
+
+    /* Refuse to scan unreasonably large headers */
+    if (strlen(cookie_hdr) > 4096)
+        return 0;
+
+    const char *pair = cookie_hdr;
+    while (*pair != '\0') {
+        /* Ignore the blanks that follow a ';' separator */
+        pair += strspn(pair, " ");
+
+        if (strncmp(pair, wanted, wanted_len) == 0) {
+            /* The value must end here, otherwise it is a different value */
+            char next = pair[wanted_len];
+            if (next == '\0' || next == ';' || next == ' ')
+                return 1;
+        }
+
+        /* Move on to the start of the following pair, if there is one */
+        pair += strcspn(pair, ";");
+        if (*pair == ';')
+            pair++;
+    }
+
+    return 0;
+}
+
+/*
+ * get_mime_type -- maps a file name extension to a Content-Type value
+ *
+ * The extension is everything after the LAST '.' found in path and is
+ * compared case-insensitively (".PNG" is treated like ".png"). Because
+ * the whole path is searched, an extension-less file below a directory
+ * whose name contains a dot simply matches nothing.
+ *
+ * Returns a pointer to a constant string, e.g. "text/html; charset=utf-8".
+ * Paths without a '.' and unknown extensions yield
+ * "application/octet-stream".
+ */
+static const char *get_mime_type(const char *path) {
+    static const struct {
+        const char *ext;
+        const char *mime;
+    } known[] = {
+        { "html",  "text/html; charset=utf-8" },
+        { "htm",   "text/html; charset=utf-8" },
+        { "css",   "text/css" },
+        { "js",    "application/javascript" },
+        { "json",  "application/json" },
+        { "png",   "image/png" },
+        { "jpg",   "image/jpeg" },
+        { "jpeg",  "image/jpeg" },
+        { "gif",   "image/gif" },
+        { "svg",   "image/svg+xml" },
+        { "ico",   "image/x-icon" },
+        { "txt",   "text/plain; charset=utf-8" },
+        { "pdf",   "application/pdf" },
+        { "xml",   "application/xml" },
+        { "webp",  "image/webp" },
+        { "woff",  "font/woff" },
+        { "woff2", "font/woff2" },
+    };
+    static const char fallback[] = "application/octet-stream";
+
+    const char *dot = strrchr(path, '.');
+    if (dot == NULL)
+        return fallback;
+
+    const char *ext = dot + 1; /* text after the dot, e.g. "html" */
+
+    for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+        if (strcasecmp(ext, known[i].ext) == 0)
+            return known[i].mime;
+    }
+
+    return fallback;
+}
+
+/*
+ * send_error -- writes a complete plain-text CGI error response to stdout
+ *
+ * The response consists of the header lines, an empty line, and a body
+ * of the form "<code> <reason>". The body is assembled in a 64-byte
+ * buffer, so very long reasons are cut off in the body (the Status line
+ * still carries the full reason); Content-Length always matches the
+ * body that is actually sent.
+ *
+ * Output for code=404, reason="Not Found":
+ *     Status: 404 Not Found\r\n
+ *     Content-Type: text/plain\r\n
+ *     X-Content-Type-Options: nosniff\r\n
+ *     Content-Length: 13\r\n
+ *     \r\n
+ *     404 Not Found
+ */
+static void send_error(int code, const char *reason) {
+    char text[64];
+    snprintf(text, sizeof(text), "%d %s", code, reason);
+    const size_t text_len = strlen(text);
+
+    /* "nosniff" asks the browser not to second-guess the declared type */
+    printf("Status: %d %s\r\n"
+           "Content-Type: text/plain\r\n"
+           "X-Content-Type-Options: nosniff\r\n"
+           "Content-Length: %zu\r\n"
+           "\r\n"
+           "%s",
+           code, reason, text_len, text);
+}
+
+/* --- Main program --------------------------------------------------------- */
+
+/*
+ * main -- handle one CGI request
+ *
+ * Flow:
+ *   1. Read HTTP_COOKIE and REQUEST_URI from the environment.
+ *   2. Lock the process down with unveil() and pledge().
+ *   3. Extract, decode and sanity-check the request path.
+ *   4. Without a valid access cookie: answer with a small script that
+ *      sets the cookie and reloads the page.
+ *   5. Otherwise resolve the path below DOCROOT and stream the file.
+ *
+ * Exit status: 1 if the sandbox could not be set up or the file could not
+ * be streamed completely; 0 otherwise (including ordinary HTTP error
+ * responses such as 400/403/404/414).
+ */
+int main(void) {
+    /*
+     * CGI request data is passed in environment variables:
+     *   HTTP_COOKIE -- the "Cookie:" request header, e.g. "name=value"
+     *   REQUEST_URI -- the requested path, e.g. "/page/index.html?x=1"
+     */
+    const char *cookie_hdr = getenv("HTTP_COOKIE");
+    const char *uri = getenv("REQUEST_URI");
+
+    /*
+     * unveil(): expose only DOCROOT, read-only ("r"), to this process.
+     * unveil(NULL, NULL) locks the list so no further paths can be added.
+     * Paths outside DOCROOT then behave as if they did not exist.
+     */
+    if (unveil(DOCROOT, "r") == -1 ||
+        unveil(NULL, NULL) == -1) {
+        send_error(500, "Internal Server Error");
+        return 1;
+    }
+
+    /*
+     * pledge(): from here on only the "stdio" (standard I/O) and "rpath"
+     * (read-only filesystem access) promises are available. A process
+     * that violates its pledge is killed by the kernel, which limits the
+     * damage should an attacker manage to inject code.
+     */
+    if (pledge("stdio rpath", NULL) == -1) {
+        send_error(500, "Internal Server Error");
+        return 1;
+    }
+
+    /* No URI supplied by the server: fall back to the root page */
+    if (!uri)
+        uri = "/";
+
+    /*
+     * Copy the path part of the URI, i.e. everything before the first '?'.
+     * The query string is irrelevant for locating the file.
+     *
+     * A path that does not fit into the buffer is rejected outright;
+     * silently truncating it would make the program look up a different
+     * resource than the one that was requested.
+     */
+    char raw_uri[PATH_MAX];
+    size_t raw_len = strcspn(uri, "?");
+    if (raw_len >= sizeof(raw_uri)) {
+        send_error(414, "URI Too Long");
+        return 0;
+    }
+    memcpy(raw_uri, uri, raw_len);
+    raw_uri[raw_len] = '\0';
+
+    /* Decode percent-encoding: "/page%2Ftest%20foo" -> "/page/test foo" */
+    char decoded_uri[PATH_MAX];
+    url_decode(decoded_uri, raw_uri, sizeof(decoded_uri));
+
+    /* Early plausibility check; also guarantees a leading '/' */
+    if (sanitize_uri(decoded_uri) != 0) {
+        send_error(400, "Bad Request");
+        return 0;
+    }
+
+    /*
+     * Cookie check -- the core of the access control.
+     *
+     * Without a valid cookie the client receives a script that sets the
+     * cookie and reloads the page, so the follow-up request carries it.
+     * The script is assembled from COOKIE_NAME / COOKIE_VALUE so that it
+     * cannot drift apart from what check_bypass() expects.
+     *
+     * Cache-Control: no-store keeps the challenge page out of caches.
+     */
+    if (!check_bypass(cookie_hdr)) {
+        static const char body[] =
+            "<script>"
+            "document.cookie = \"" COOKIE_NAME "=" COOKIE_VALUE "; Path=/;\";"
+            "window.location.reload();"
+            "</script>";
+
+        printf("Status: 418 I'm a teapot\r\n");
+        printf("Cache-Control: no-store\r\n");
+        printf("Content-Type: text/html\r\n");
+        printf("X-Content-Type-Options: nosniff\r\n");
+        printf("Content-Length: %zu\r\n\r\n%s", sizeof(body) - 1, body);
+        return 0;
+    }
+
+    /*
+     * Build the file path: DOCROOT + decoded URI. A URI ending in '/'
+     * denotes a directory and is mapped to its index.html.
+     * decoded_uri is never empty here because sanitize_uri() requires a
+     * leading '/', so indexing its last character is safe.
+     */
+    size_t uri_len = strlen(decoded_uri);
+    const char *index_name =
+        (decoded_uri[uri_len - 1] == '/') ? "index.html" : "";
+
+    char path[PATH_MAX];
+    int r = snprintf(path, sizeof(path), "%s%s%s",
+                     DOCROOT, decoded_uri, index_name);
+    if (r < 0 || r >= (int)sizeof(path)) {
+        send_error(400, "Bad Request");
+        return 0;
+    }
+
+    /*
+     * Canonicalise the path. realpath() resolves "..", "." and symbolic
+     * links and yields the absolute path the request really refers to.
+     * It fails if the file does not exist, which is reported as 404.
+     */
+    char resolved[PATH_MAX];
+    if (realpath(path, resolved) == NULL) {
+        send_error(404, "Not Found");
+        return 0;
+    }
+
+    /*
+     * Prefix check: the canonical path must be DOCROOT itself or lie
+     * below it. Requiring '/' (or the end of the string) right after the
+     * prefix keeps a sibling such as DOCROOT "-evil" from being accepted.
+     */
+    size_t docroot_len = strlen(DOCROOT);
+    if (strncmp(resolved, DOCROOT, docroot_len) != 0 ||
+        (resolved[docroot_len] != '/' && resolved[docroot_len] != '\0')) {
+        send_error(403, "Forbidden");
+        return 0;
+    }
+
+    /*
+     * Open the file read-only. O_NOFOLLOW makes open() fail if the final
+     * path component is a symbolic link, e.g. because the file was
+     * swapped for a link after realpath() ran. It does not cover
+     * intermediate directories.
+     */
+    int fd = open(resolved, O_RDONLY | O_NOFOLLOW);
+    if (fd == -1) {
+        send_error(403, "Forbidden");
+        return 0;
+    }
+
+    /*
+     * Inspect the object through the descriptor (not the path) so the
+     * metadata belongs to exactly what was opened. Only regular files
+     * are served; directories, devices, pipes etc. are reported as 404.
+     */
+    struct stat st;
+    if (fstat(fd, &st) != 0 || !S_ISREG(st.st_mode)) {
+        close(fd);
+        send_error(404, "Not Found");
+        return 0;
+    }
+
+    /* Wrap fd in a stream; on success fclose() also closes fd */
+    FILE *f = fdopen(fd, "rb");
+    if (!f) {
+        close(fd);
+        send_error(500, "Internal Server Error");
+        return 0;
+    }
+
+    /*
+     * Response headers.
+     * st.st_size is an off_t, hence the cast for a portable format.
+     * X-Frame-Options: DENY forbids embedding the page in a frame.
+     * The empty line terminates the header section.
+     */
+    printf("Status: 200 OK\r\n");
+    printf("Content-Type: %s\r\n", get_mime_type(resolved));
+    printf("X-Content-Type-Options: nosniff\r\n");
+    printf("X-Frame-Options: DENY\r\n");
+    printf("Content-Length: %lld\r\n\r\n", (long long)st.st_size);
+    fflush(stdout); /* push the headers out before the body follows */
+
+    /*
+     * Copy the file to stdout in BUF_SIZE chunks. The headers are already
+     * out, so a failure cannot be turned into an HTTP error any more; the
+     * copy is abandoned and the failure is reported via the exit status.
+     */
+    int status = 0;
+    char buf[BUF_SIZE];
+    size_t n;
+    while ((n = fread(buf, 1, sizeof(buf), f)) > 0) {
+        if (fwrite(buf, 1, n, stdout) != n) {
+            status = 1; /* receiver went away or stdout failed */
+            break;
+        }
+    }
+    if (ferror(f))
+        status = 1; /* reading the file failed part-way */
+
+    fclose(f); /* also closes the underlying fd */
+
+    if (fflush(stdout) != 0)
+        status = 1;
+
+    return status;
+}