From 750840146284b9a8dc041a8cb687e1f0f2380bf8 Mon Sep 17 00:00:00 2001 From: AKP Date: Thu, 19 Jan 2023 15:10:48 +0000 Subject: Support checking ETags on RSS feeds Signed-off-by: AKP --- go.mod | 2 +- go.sum | 2 ++ walrss/internal/db/db.go | 5 +++++ walrss/internal/rss/processor.go | 48 +++++++++++++++++++++++++++++++++++----- 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 19a041d..d9ffdfc 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.19 require ( github.com/bwmarrin/go-alone v0.0.0-20190806015146-742bb55d1631 - github.com/carlmjohnson/requests v0.22.2 + github.com/carlmjohnson/requests v0.22.3 github.com/coreos/go-oidc v2.2.1+incompatible github.com/gofiber/fiber/v2 v2.31.0 github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible diff --git a/go.sum b/go.sum index f35d214..b1ba26d 100644 --- a/go.sum +++ b/go.sum @@ -17,6 +17,8 @@ github.com/bwmarrin/go-alone v0.0.0-20190806015146-742bb55d1631 h1:Xb5rra6jJt5Z1 github.com/bwmarrin/go-alone v0.0.0-20190806015146-742bb55d1631/go.mod h1:P86Dksd9km5HGX5UMIocXvX87sEp2xUARle3by+9JZ4= github.com/carlmjohnson/requests v0.22.2 h1:hccG5g9ITJlnDip54OVa810AkB366kthFjvA90N4owM= github.com/carlmjohnson/requests v0.22.2/go.mod h1:Hw4fFOk3xDlHQbNRTGo4oc52TUTpVEq93sNy/H+mrQM= +github.com/carlmjohnson/requests v0.22.3 h1:ip16AKXNYuArdw9L5/1mL+mNorlZO5XhkLg617yOumc= +github.com/carlmjohnson/requests v0.22.3/go.mod h1:iTsaX9TdFg2+L4WtZO/HFyDMPEfBnogV3i4A4gjDnvs= github.com/coreos/go-oidc v2.2.1+incompatible h1:mh48q/BqXqgjVHpy2ZY7WnWAbenxRjsz9N1i1YxjHAk= github.com/coreos/go-oidc v2.2.1+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHoZ1nMCKZlZ9V6mm3/LKc= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= diff --git a/walrss/internal/db/db.go b/walrss/internal/db/db.go index f94aa4f..6ee09cb 100644 --- a/walrss/internal/db/db.go +++ b/walrss/internal/db/db.go @@ -49,6 +49,11 @@ type Feed struct { User *User `bun:",rel:belongs-to,join:user_id=id"` } +func (f *Feed) CacheWithEtag(etag, content string) { + f.LastEtag = etag + f.CachedContent = content +} + type FeedSlice []*Feed func (f FeedSlice) Len() int { diff --git a/walrss/internal/rss/processor.go b/walrss/internal/rss/processor.go index 8e34466..151eb06 100644 --- a/walrss/internal/rss/processor.go +++ b/walrss/internal/rss/processor.go @@ -15,7 +15,9 @@ import ( "github.com/mmcdole/gofeed" "github.com/patrickmn/go-cache" "github.com/rs/zerolog/log" + "net/http" "net/smtp" + "net/textproto" "sort" "strings" "sync" @@ -97,7 +99,7 @@ func ProcessUserFeed(st *state.State, user *db.User, progressChannel chan string pf := new(processedFeed) pf.Name = f.Name - rawFeed, err := getFeedContent(f.URL) + rawFeed, err := getFeedContent(st, f) if err != nil { pf.Error = err reportProgress(progressChannel, "Failed to fetch: "+err.Error()) @@ -148,11 +150,11 @@ var ( feedFetchLock = new(sync.Mutex) ) -func getFeedContent(url string) (*gofeed.Feed, error) { +func getFeedContent(st *state.State, f *db.Feed) (*gofeed.Feed, error) { feedFetchLock.Lock() defer feedFetchLock.Unlock() - if v, found := feedCache.Get(url); found { + if v, found := feedCache.Get(f.URL); found { return v.(*gofeed.Feed), nil } @@ -160,16 +162,52 @@ func getFeedContent(url string) (*gofeed.Feed, error) { ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) defer cancel() - if err := requests.URL(url).ToBytesBuffer(buf).UserAgent(userAgent).Fetch(ctx); err != nil { + var notModified bool + headers := make(textproto.MIMEHeader) + + requestBuilder := requests.URL(f.URL).ToBytesBuffer(buf).UserAgent(userAgent).CopyHeaders(headers) + + if f.LastEtag != "" { + requestBuilder.AddValidator( + func(resp *http.Response) error { + if resp.StatusCode == http.StatusNotModified { + notModified = true + return nil + } else { + return requests.DefaultValidator(resp) + } + }, + ) + requestBuilder.Header("If-None-Match", f.LastEtag) + } else { + requestBuilder.AddValidator(requests.DefaultValidator) // Since we're using CopyHeaders, we need to add the + // default validator back ourselves. + } + + if err := requestBuilder.Fetch(ctx); err != nil { return nil, err } + if notModified { + log.Debug().Msgf("%s not modified", f.URL) + buf.WriteString(f.CachedContent) + } else { + log.Debug().Msgf("%s modified", f.URL) + etag := headers.Get("ETag") + if etag != "" { + f.CacheWithEtag(etag, buf.String()) + if err := core.UpdateFeed(st, f); err != nil { + return nil, fmt.Errorf("failed to cache ETag-ed response: %v", err) + } + } + } + feed, err := gofeed.NewParser().Parse(buf) if err != nil { return nil, err } - _ = feedCache.Add(url, feed, cache.DefaultExpiration) + _ = feedCache.Add(f.URL, feed, cache.DefaultExpiration) return feed, nil } -- cgit v1.2.3-70-g09d2