117 lines
3.1 KiB
Go
117 lines
3.1 KiB
Go
package scraper
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Video holds the metadata extracted for a single search result.
//
// All fields are plain strings taken from the results page; Duration and
// UploadDate are kept exactly as the page presents them and are not parsed.
type Video struct {
	// Title is the result's title text after escape sequences are decoded.
	Title string
	// URL is the full "https://www.youtube.com/watch?v=<id>" link.
	URL string
	// Channel is the owner text of the result; empty when none was matched.
	Channel string
	// Duration is the length text of the result; empty when none was matched.
	Duration string
	// UploadDate is the published-time text of the result; empty when none
	// was matched.
	UploadDate string
}
|
|
|
|
// Regular expressions, updated to match the shell script, that pull the
// individual fields of each search result out of the JSON embedded in the
// results page. Each pattern exposes the wanted text as capture group 1.
var (
	// titleRegex and channelRegex accept backslash escapes inside the
	// captured text (`(?:[^"\\]|\\.)+`), so a value containing an escaped
	// quote (\") is captured whole instead of failing to match.
	titleRegex   = regexp.MustCompile(`"title":\{"runs":\[\{"text":"((?:[^"\\]|\\.)+)"\}\]`)
	channelRegex = regexp.MustCompile(`"ownerText":\{"runs":\[\{"text":"((?:[^"\\]|\\.)+)"\}\]`)

	durationRegex   = regexp.MustCompile(`"lengthText":\{"accessibility":\{"accessibilityData":\{"label":"[^"]*"\}\},"simpleText":"([^"]+)"`)
	uploadDateRegex = regexp.MustCompile(`"publishedTimeText":\{"simpleText":"([^"]+)"\}`)

	// videoIDRegex captures only ID characters, so trailing escaped query
	// parameters such as `\u0026pp=...` are not swallowed into the ID.
	videoIDRegex = regexp.MustCompile(`watch\?v=([A-Za-z0-9_-]+)`)
)
|
|
|
|
func FetchVideos(query string) ([]Video, error) {
|
|
client := &http.Client{
|
|
Timeout: 10 * time.Second,
|
|
}
|
|
|
|
// Format URL similar to the shell script
|
|
searchURL := fmt.Sprintf("https://www.youtube.com/results?search_query=%s",
|
|
url.QueryEscape(strings.ReplaceAll(query, " ", "+")))
|
|
|
|
fmt.Printf("Fetching: %s\n", searchURL) // Debug print
|
|
|
|
req, err := http.NewRequest("GET", searchURL, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error creating request: %w", err)
|
|
}
|
|
|
|
// Add headers to mimic a browser
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error making request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error reading response: %w", err)
|
|
}
|
|
content := string(body)
|
|
|
|
// Extract information
|
|
titles := titleRegex.FindAllStringSubmatch(content, -1)
|
|
channels := channelRegex.FindAllStringSubmatch(content, -1)
|
|
durations := durationRegex.FindAllStringSubmatch(content, -1)
|
|
uploadDates := uploadDateRegex.FindAllStringSubmatch(content, -1)
|
|
videoIDs := videoIDRegex.FindAllStringSubmatch(content, -1)
|
|
|
|
fmt.Printf("Found: %d titles, %d channels, %d durations, %d dates, %d IDs\n",
|
|
len(titles), len(channels), len(durations), len(uploadDates), len(videoIDs))
|
|
|
|
var videos []Video
|
|
for i := 0; i < len(titles) && i < 10; i++ { // Limit to 10 results like the shell script
|
|
if i >= len(videoIDs) {
|
|
break
|
|
}
|
|
|
|
video := Video{
|
|
Title: unescapeHTML(titles[i][1]),
|
|
URL: fmt.Sprintf("https://www.youtube.com/watch?v=%s", videoIDs[i][1]),
|
|
}
|
|
|
|
if i < len(channels) {
|
|
video.Channel = unescapeHTML(channels[i][1])
|
|
}
|
|
if i < len(durations) {
|
|
video.Duration = durations[i][1]
|
|
}
|
|
if i < len(uploadDates) {
|
|
video.UploadDate = uploadDates[i][1]
|
|
}
|
|
|
|
videos = append(videos, video)
|
|
}
|
|
|
|
if len(videos) == 0 {
|
|
return nil, fmt.Errorf("no videos found")
|
|
}
|
|
|
|
return videos, nil
|
|
}
|
|
|
|
// escapeReplacer decodes the escape sequences handled by unescapeHTML: the
// JSON-style escapes \u0026, \", \u003c and \u003e, plus the HTML entities
// &quot; and &#39;.
//
// A Replacer works in a single pass over the input, so the result is
// deterministic and already-decoded text is never decoded a second time.
var escapeReplacer = strings.NewReplacer(
	`\u0026`, "&",
	`\"`, `"`,
	`\u003c`, "<",
	`\u003e`, ">",
	"&quot;", `"`,
	"&#39;", "'",
)

// unescapeHTML returns s with the escape sequences known to escapeReplacer
// replaced by the characters they stand for. Any other text is returned
// unchanged.
func unescapeHTML(s string) string {
	return escapeReplacer.Replace(s)
}
|