commit 9a10f4f97a9b4c1c66af89eceb584a4a0620b8a1
parent 4dfeed8311e77525f61f2df5a563c42da170ab64
Author: rra <rscmbbng@riseup.net>
Date: Wed Oct 5 21:16:02 2022 +0200
now it actually downloads loops
Diffstat:
 download_loooooops.py | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
1 file changed, 81 insertions(+), 47 deletions(-)
diff --git a/download_loooooops.py b/download_loooooops.py
@@ -1,62 +1,96 @@
import requests
from time import sleep
-
+import datetime
+import os
+from urllib.parse import urlparse
+import shutil
#def download_media(dir, url):
-# remote_url
-# description
+# remote_url
+# description
+
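+# destination for all downloaded loops; hard-coded for this machine, adjust as needed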
+output_dir = "/home/r/Programming/radio-looptober/loops"
+
+def grab_media(path, url):
+
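+    # name the local file after the last path segment of the media URL;
+    # files that already exist on disk are skipped so the script can be re-run safely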
+ media_item = urlparse(url).path.split('/')[-1]
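+    # custom headers so the instance can see who is scraping and how to get in touch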
+ headers = {
+ 'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
+ 'From': 'post.lurk.org/@lurk' # This is another valid field
+ }
+
+ if not os.path.exists(os.path.join(path, media_item)):
+ response = requests.get(url, headers=headers, stream=True)
+ if response.ok:
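+            # stream the body straight to disk instead of buffering whole files in memory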
+ with open(os.path.join(path, media_item), 'wb') as media_file:
+ shutil.copyfileobj(response.raw, media_file)
+ print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))
+ return media_item
#This pages through all the looptober tag and collects the json in 'data'
there_is_more = True
url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
data = []
while there_is_more:
- print("downloading", url)
- r = requests.get(url)
- print(r.status_code)
- if r.ok:
- if r.content:
-
- data.append(r.json())
- print(len(data))
- sleep(1)
-
- if r.links:
- url = r.links["next"]["url"]
- print("found next url", url)
-
- else:
- print("no more data")
- there_is_more = False
- break
- else:
- break
+ print("downloading", url)
+ r = requests.get(url)
+ print("response status: ", r.status_code)
+ if r.ok:
+ if r.content:
+
+ data.append(r.json())
+ print("amount of pages:", len(data))
+ sleep(0.5)
+
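+            # Mastodon paginates via the HTTP Link header, which requests exposes as r.links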
+            if r.links and "next" in r.links:
+ url = r.links["next"]["url"]
+ print("found next url", url)
+
+ else:
+ print("no more data")
+ there_is_more = False
+ break
+ else:
+ break
#this parses all the json, taking a few valuable fields and puts them in looooops
looooops = []
for collection in data:
- for i in collection:
- if i["media_attachments"]: #we only take entries that actually contain a sound file
- creation_date = datetime.datetime.fromisoformat(
- i['created_at'][:-1]).astimezone(
- datetime.timezone.utc)
-
- if creation_date.strftime('%Y') == "2022": #we only take entries from this year
- stuff = {}
- stuff["url"] = i["url"]
- stuff["description"] = i["content"]
- stuff["audio"] = i["media_attachments"]
- stuff["date"] = i["created_at"]
- stuff["id"] = i["id"]
- stuff["creator"] = i["account"]["username"]
- looooops.append(stuff)
- print("found post by {} with {} looops".format(
- i["account"]["username"],
- len(i["media_attachments"])))
-
-
-#for l in looooops:
- # create a folder per l, named id
- # download the files in media_attachments using the remote_url
- # find a way to stuff metadata in the file
+ for i in collection:
+ if i["media_attachments"]: #we only take entries that actually contain a sound file
+ creation_date = datetime.datetime.fromisoformat(
+ i['created_at'][:-1]).astimezone(
+ datetime.timezone.utc)
+
+ if creation_date.strftime('%Y') == "2022": #we only take entries from this year
+ stuff = {}
+ stuff["url"] = i["url"]
+ stuff["description"] = i["content"]
+ stuff["audio"] = i["media_attachments"]
+ stuff["date"] = i["created_at"]
+ stuff["id"] = i["id"]
+ stuff["creator"] = i["account"]["username"]
+ looooops.append(stuff)
+ print("found post by {} with {} looops".format(
+ i["account"]["username"],
+ len(i["media_attachments"])))
+
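+# create the destination folders, then fetch every attachment of every loop post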
+if not os.path.exists(output_dir):
+ os.mkdir(output_dir)
+
+for l in looooops:
+ path = os.path.join(output_dir,"{}_{}".format(l['creator'], l['id']))
+ if not os.path.exists(path):
+ os.mkdir(path)
+
+ print("\n")
+ print("Downloading looops by ***{}***".format(l['creator']))
+ for a in l['audio']:
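+        # prefer the original file on the author's home instance; fall back to this instance's cached copy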
+ if a['remote_url']:
+ url = a['remote_url']
+ else:
+ url = a['url']
+
+ grab_media(path, url)
+