ytgo/scraper/scraper.go
2024-12-12 11:37:43 +01:00

117 lines
3.1 KiB
Go

package scraper
import (
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
)
// Video describes a single search result as scraped by FetchVideos.
//
// Every field holds display text taken from the results page; nothing is
// parsed into numeric or time types.
type Video struct {
	// Title is the result's title text after unescapeHTML has been applied.
	Title string
	// URL is the watch link, "https://www.youtube.com/watch?v=" followed by
	// the scraped video ID.
	URL string
	// Channel is the owner text of the result. It is empty when the page
	// yielded fewer channel matches than titles.
	Channel string
	// Duration is the length text exactly as shown on the page. It is empty
	// when no matching entry was found.
	Duration string
	// UploadDate is the published-time text exactly as shown on the page. It
	// is empty when no matching entry was found.
	UploadDate string
}
// Regular expressions that pull individual fields out of the JSON embedded in
// the search results page. Updated to match the shell script.
//
// Each pattern has exactly one capture group holding the field's text. The
// text is returned as it appears in the page, i.e. still escaped.
var (
	// The title and channel captures accept backslash escapes, so text that
	// contains an escaped quote (\") is matched in full instead of being
	// skipped.
	titleRegex   = regexp.MustCompile(`"title":\{"runs":\[\{"text":"((?:[^"\\]|\\.)+)"\}\]`)
	channelRegex = regexp.MustCompile(`"ownerText":\{"runs":\[\{"text":"((?:[^"\\]|\\.)+)"\}\]`)

	durationRegex   = regexp.MustCompile(`"lengthText":\{"accessibility":\{"accessibilityData":\{"label":"[^"]*"\}\},"simpleText":"([^"]+)"`)
	uploadDateRegex = regexp.MustCompile(`"publishedTimeText":\{"simpleText":"([^"]+)"\}`)

	// Only letters, digits, '-' and '_' are captured, so anything that
	// follows the ID in the link (for example an escaped "\u0026pp=...")
	// stays out of the capture.
	videoIDRegex = regexp.MustCompile(`watch\?v=([A-Za-z0-9_-]+)`)
)
// FetchVideos searches YouTube for query and scrapes the results page.
//
// It returns at most 10 videos in page order. An error is returned when the
// request cannot be created or sent, when the server answers with a status
// other than 200, when the body cannot be read, or when no video could be
// extracted from the page.
func FetchVideos(query string) ([]Video, error) {
	client := &http.Client{
		Timeout: 10 * time.Second,
	}

	// url.QueryEscape already encodes spaces as "+". Replacing them by hand
	// first would get every "+" escaped again to "%2B", so the server would
	// receive literal plus signs instead of spaces.
	searchURL := fmt.Sprintf("https://www.youtube.com/results?search_query=%s",
		url.QueryEscape(query))
	fmt.Printf("Fetching: %s\n", searchURL) // Debug print

	req, err := http.NewRequest(http.MethodGet, searchURL, nil)
	if err != nil {
		return nil, fmt.Errorf("error creating request: %w", err)
	}

	// Add headers to mimic a browser
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error making request: %w", err)
	}
	defer resp.Body.Close()

	// An error page would otherwise be parsed like a results page and end in
	// a misleading "no videos found".
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected response status: %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error reading response: %w", err)
	}

	videos := parseSearchResults(string(body))
	if len(videos) == 0 {
		return nil, fmt.Errorf("no videos found")
	}
	return videos, nil
}

// parseSearchResults extracts up to 10 videos from the raw results page by
// pairing the i-th title match with the i-th distinct video ID and, where
// present, the i-th channel, duration and upload date match.
func parseSearchResults(content string) []Video {
	const maxResults = 10 // Limit to 10 results like the shell script

	titles := titleRegex.FindAllStringSubmatch(content, -1)
	channels := channelRegex.FindAllStringSubmatch(content, -1)
	durations := durationRegex.FindAllStringSubmatch(content, -1)
	uploadDates := uploadDateRegex.FindAllStringSubmatch(content, -1)
	idMatches := videoIDRegex.FindAllStringSubmatch(content, -1)
	fmt.Printf("Found: %d titles, %d channels, %d durations, %d dates, %d IDs\n",
		len(titles), len(channels), len(durations), len(uploadDates), len(idMatches))

	// Keep only the first occurrence of each ID. A watch link that shows up
	// more than once for the same video would otherwise shift every later
	// ID against its title.
	seen := make(map[string]struct{}, len(idMatches))
	ids := make([]string, 0, len(idMatches))
	for _, m := range idMatches {
		if _, dup := seen[m[1]]; dup {
			continue
		}
		seen[m[1]] = struct{}{}
		ids = append(ids, m[1])
	}

	// A video needs both a title and an ID; the other fields are optional.
	n := len(titles)
	if len(ids) < n {
		n = len(ids)
	}
	if n > maxResults {
		n = maxResults
	}

	videos := make([]Video, 0, n)
	for i := 0; i < n; i++ {
		video := Video{
			Title: unescapeHTML(titles[i][1]),
			URL:   fmt.Sprintf("https://www.youtube.com/watch?v=%s", ids[i]),
		}
		if i < len(channels) {
			video.Channel = unescapeHTML(channels[i][1])
		}
		if i < len(durations) {
			video.Duration = durations[i][1]
		}
		if i < len(uploadDates) {
			video.UploadDate = uploadDates[i][1]
		}
		videos = append(videos, video)
	}
	return videos
}
// htmlUnescaper maps the escape sequences handled by unescapeHTML to the
// characters they stand for. It is built once instead of on every call.
var htmlUnescaper = strings.NewReplacer(
	`\u0026`, "&",
	`\"`, `"`,
	`\u003c`, "<",
	`\u003e`, ">",
	"&quot;", `"`,
	"&#39;", "'",
)

// unescapeHTML replaces a fixed set of escape sequences in s with the
// characters they represent: the backslash escapes \u0026, \u003c, \u003e and
// \", and the HTML entities &quot; and &#39;.
//
// All replacements happen in one left-to-right pass, so text produced by one
// replacement is never rescanned and the result is the same on every call.
// Ranging over a map of replacements instead would apply them in random
// order, and overlapping input such as `\u0026quot;` could come out
// differently from run to run.
func unescapeHTML(s string) string {
	return htmlUnescaper.Replace(s)
}