filed

Job queue using FUSE

git clone git://mccd.space/filed

commit d0992dccfc635d431446e06b65e5d5b412f3b360
parent 8e3db53500da9e2a8ddeb2950552744d3b5fe5f7
Author: Marc Coquand <marc@coquand.email>
Date:   Tue, 16 Dec 2025 15:53:21 +0100

*

Diffstat:
MREADME.md | 23++++++++++++++---------
Mfiled.1.scd | 15++++++++++-----
Mjobdir.go | 4++--
Mmain.go | 1+
Mmanager.go | 2+-
Mpendingdir.go | 2+-
Mstore/jobs.go | 5+++--
7 files changed, 32 insertions(+), 20 deletions(-)
diff --git a/README.md b/README.md
@@ -36,36 +36,41 @@ $ mkdir /tmp/filed-jobs
 $ filed /tmp/filed-jobs
 ```
 
-Once run, `filed` will set up a directory in `filed-jobs` that contains a few files and directories.
+`filed` mounts the directory `filed-jobs` and exposes a few files and directories.
 
-A job can easily be added by just creating a file in the pending directory:
+A job can then be added by creating a file in the pending directory:
 
 ```
 $ printf "echo 'hello world'" > /tmp/filed-jobs/pending/1
 ```
 
-If all went well, you can see the job output:
+If all went well, you can see the job output in `/complete`:
 
 ```
 $ cat /tmp/filed-jobs/complete/1
 ```
 
-By default, a job retries 3 times, and if unsuccessful is moved to the `failed` directory. You can inspect the logs to see what went wrong:
+By default, a job retries 3 times, and if unsuccessful, it is moved to the `failed` directory. You can inspect the logs to see what went wrong:
 
 ```
-$ cat /tmp/filed-jobs/failed/1
+$ cat /tmp/filed-jobs/failed/2
+>>> ech this-will-fail
+sh: 1: ech: not found
+
+
+[System Error]: exit status 127
 ```
 
 And you can restart a job by moving the job back to pending:
 
 ```
-$ mv /tmp/filed-jobs/failed/1 /tmp/filed-jobs/pending
+$ mv /tmp/filed-jobs/failed/2 /tmp/filed-jobs/pending
 ```
 
 Finally, if you want to remove a completed or failed job:
 
 ```
-$ rm /tmp/filed-jobs/failed/1
+$ rm /tmp/filed-jobs/failed/2
 ```
 
 ## Design & Motivation
@@ -76,9 +81,9 @@ Often these jobs can fail, whether that's due to network errors, memory issues o
 
 I wanted a tool that I could incorporate and use with whatever programming language I desired, and that makes it easy to understand when a job fails and rerun jobs if there is an error. `filed` is very intuitive to build an integration for: just write a file telling it what to execute.
 
-I also wanted a tool that made it simple to inspect, without needing to expose a web portal or set up separate auth system. `filed` allows you to inspect and operate the queue just by SSHing into the server, and reuses the decades old proven identity system already built into Linux.
+I also wanted a tool that made it simple to inspect, without needing to expose an admin portal with a separate sign-in. `filed` allows you to inspect and operate the queue just by SSHing into the server, and reuses the decades-old identity system already built into Linux.
 
-The simple file-based API of File d'attente, inspired by plan9, also allows me to slim down the amount of code needed considerably, while still exposing a very scriptable and easy-to-understand interface.
+The simple file-based API of File d'attente also allows me to considerably slim down the amount of code I need to write, while still exposing a very scriptable and easy-to-understand interface.
 
 I've tried a few other queue tools: sqs/sns, rabbitmq, bull, systemd-run. The first two felt heavyweight, and required setting up a lot of infrastructure, especially if you want to rerun and inspect jobs. It felt like far too much work for a simple app. Bull was more in line with what I wanted, but I think operating on files is simpler for building custom automation, and easier to secure. Systemd-run lacked the retry functionality and the interface was rather clunky.
 
diff --git a/filed.1.scd b/filed.1.scd
@@ -11,11 +11,12 @@ filed - queue jobs utility
 # DESCRIPTION
 
 filed (file d'attente) is an inspectable job queue that operates on files
-with retries. It mounts a directory to _mountpoint_ that is used to inspect
-and run jobs.
+with retries. It mounts a directory to _mountpoint_, which is where the user
+can add and inspect jobs.
 
-filed exposes 4 directories, where each directory contains zero or more _jobs_.
-Job names must be unique across all four directories. The directories are:
+filed exposes 4 directories to _mountpoint_, where each directory contains
+zero or more _jobs_. Job names must be unique across all four directories. The
+directories are:
 
 	*pending* - jobs to be run. To create a new job, create a file 
 	here with the command to run.
@@ -54,6 +55,9 @@ has access to the state, and is thus able to rewrite access rights. It is
 recommended for the running scripts to use _namespaces(7)_ or _Landlock(7)_
 to drop privileges. More security features are coming in the future.
 
+Another aspect to be aware of is that File d'attente stores logs of all jobs.
+Care should be taken to ensure that no secrets are printed.
+
 Access rights can be modified using _CHOWN(1)_ and _CHMOD(1)_. 
 
 # MAINTENANCE
@@ -74,7 +78,8 @@ Maximum amount of retries before moving the job to failed.
 
 ## Max job count
 
-Maximum amount of concurrent jobs
+Maximum number of concurrent jobs. It is recommended not to set this much
+higher than 20.
 
 ## Backoff mult and backoff base
 
diff --git a/jobdir.go b/jobdir.go
@@ -104,14 +104,14 @@ func (d JobDir) Rename(ctx context.Context, req *fuse.RenameRequest, newDir fs.N
 }
 
 func (jd JobDir) Lookup(ctx context.Context, name string) (fs.Node, error) {
-	slog.Info("FUSE: Jobdir Lookup", "name", name)
+	slog.Debug("FUSE: Jobdir Lookup", "name", name)
 	job, err := jd.manager.store.GetJob(name)
 	if err != nil {
 		slog.Warn("FUSE: Not found", "name", name)
 		return nil, syscall.ENOENT
 	}
 	if job.State == jd.state {
-		slog.Info("FUSE: Found job", "id", job.ID)
+		slog.Debug("FUSE: Found job", "id", job.ID)
 		return &File{job, jd.manager}, nil
 	} else {
 		return nil, syscall.ENOENT
diff --git a/main.go b/main.go
@@ -32,6 +32,7 @@ func main() {
 		xdg_home := os.Getenv("XDG_DATA_HOME")
 		if xdg_home == "" {
 			fmt.Fprintf(os.Stderr, "FILED_STATE_FILE environment variable needs to be set.\n")
+			fmt.Fprintf(os.Stderr, "For example: export FILED_STATE_FILE=$HOME/.local/share/filed.db")
 			usage()
 			os.Exit(1)
 		}
diff --git a/manager.go b/manager.go
@@ -129,7 +129,7 @@ func (jm *JobManager) runJob(id, commandStr string) {
 		jobOutput = append(jobOutput, []byte(errMsg)...)
 		jm.store.RestartJob(id, jobOutput)
 	} else {
-		slog.Info("Worker: Job completed", "id", id, "Output", jobOutput, "exitCode", exitCode)
+		slog.Info("Worker: Job completed", "id", id, "exitCode", exitCode)
 		jm.store.CompleteJob(id, jobOutput)
 	}
 }
diff --git a/pendingdir.go b/pendingdir.go
@@ -102,7 +102,7 @@ type File struct {
 func (f File) Attr(ctx context.Context, a *fuse.Attr) error {
 	// Add 20 to avoid collision with static files
 	a.Inode = uint64(f.job.INode + 20)
-	slog.Info("FUSE", "inode", a.Inode)
+	slog.Debug("FUSE", "inode", a.Inode)
 	a.Mode = 0o775
 	a.Gid = uint32(os.Getgid())
 	a.Uid = uint32(os.Getuid())
diff --git a/store/jobs.go b/store/jobs.go
@@ -40,7 +40,7 @@ func NewStore(filepath string) (*Store, error) {
 		return nil, err
 	}
 
-	if _, err := db.Exec("PRAGMA journal_mode=WAL;"); err != nil {
+	if _, err := db.Exec("PRAGMA journal_mode=WAL;PRAGMA busy_timeout=5000;"); err != nil {
 		return nil, err
 	}
 
@@ -147,7 +147,8 @@ func (s *Store) DeleteJob(id string) error {
 }
 
 func (s *Store) ListJobsByState(state string) ([]Job, error) {
-	rows, err := s.db.Query("SELECT id, command, attempts, created_at,updated_at FROM jobs WHERE state = ?", state)
+	// Since it's a queue, it should be first in first out.
+	rows, err := s.db.Query("SELECT id, command, attempts, created_at,updated_at FROM jobs WHERE state = ? ORDER BY created_at", state)
 	if err != nil {
 		return nil, err
 	}